#!/usr/bin/python3

from utils import *
from download_page import download

# portable code to get filename
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]

def try_common_paths (verbosity, url, args):
    debug ("Trying common paths for " + url + "...", verbosity, platform)

    # strip extra arguments at end of URL
    for symbol in ["?", "&", ";", "#"]:
        if symbol in url:
            url = url[:url.index(symbol)]

    # strip trailing slash (if applicable)
    if url.endswith("/"):
        url = url[:-1]

    # a list (not a set) so the paths are tried in a predictable order
    common_paths = [
        "atom",
        "atom.xml",
        "feed",
        "feed.atom",
        "feed.rss",
        "feed.xml",
        "rss",
        "rss.xml",
    ]
    for path in common_paths:
        page, response_code = download (platform, url + '/' + path, args, verbosity, True)
        if response_code == 200:
            # TODO: verify it is a valid RSS feed
            # Some pages serve response 200 for invalid pages
            # assume we found a feed
            return url + '/' + path

    # failed to find
    return None

def extract_from_page (page, verbosity, url, args):
    # Pages often include links like
    #   <link rel="alternate" type="application/rss+xml" href="...">
    # We want to start with the more specific patterns so we can be more
    # confident in the results, but we'll try to figure it out...
    # Sometimes these include multiple entries. We want the first one
    # because it's usually the correct one. Other entries might be
    # comments, etc.
    delimiters = ['"', "'", '']  # double-quoted, single-quoted, or unquoted attribute values
    feed_types = ["rss", "atom"]
    # literal and HTML-entity-encoded plus signs
    plus_signs = ['+', "&#43;", "&#x2b;"]
    for delimiter in delimiters:
        for feed_type in feed_types:
            for plus_sign in plus_signs:
                pattern = "type=" + delimiter + "application/" + feed_type + plus_sign + "xml" + delimiter
                # grab the rest of the tag on either side of the type attribute
                before = search (page, '<', pattern, reverse=True)
                after = search (page, pattern, '>')
                # if one is not None, we may get the feed
                if before is None and after is None:
                    continue
                # let us safely mess with these strings
                if before is None:
                    before = ""
                if after is None:
                    after = ""
                string_to_search = None
                if "href=" in before:
                    string_to_search = before
                elif "href=" in after:
                    string_to_search = after
                if string_to_search is not None:
                    for href_delimiter in delimiters:
                        result = search (string_to_search, 'href=' + href_delimiter, href_delimiter)
                        if result is not None:
                            return result

    result = try_common_paths (verbosity, url, args)
    if result is not None:
        return result

    debug ("Failed to find from page. Let's try higher-level pages.", verbosity, platform)

    # split into domain and path
    index = url.find ("/", url.find ("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]

    # keep the first path segment, skipping past user-style prefixes
    # such as /@name, /~name, /user/name and /users/name
    if path.startswith("/@") or path.startswith("/~"):
        offset = 3
    elif path.startswith("/user/"):
        offset = 7
    elif path.startswith("/users/"):
        offset = 8
    else:
        offset = 1

    # find first slash after offset (if present)
    index = path.find ('/', offset)
    if index > -1:
        path = path[:index + 1]
    else:
        path = '/'

    # we don't want to infinitely recurse
    if domain + path == url:
        return None

    # try with higher-level page
    page = download (platform, domain + path, args, verbosity)
    notify ("Trying " + domain + path + " with generic extractor...", verbosity, platform)
    return extract_from_page (page, verbosity, domain + path, args)

def extract (url, page=None, network=False, verbosity=3, args={}):
    if network:
        if page is None:
            page = download (platform, url, args, verbosity)
    feed = extract_from_page (page, verbosity, url, args)
    if feed is not None:
        # a root-relative feed path needs the domain prepended
        if feed.startswith("/"):
            index = url.find ("/", url.find ("//") + 2)
            if index == -1:
                domain = url
            else:
                domain = url[:index]
            feed = domain + feed
    return feed
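
# Minimal usage sketch, assuming `utils` and `download_page` from the
# surrounding project are importable and that `download` fetches pages as
# the calls above expect. The URL below is illustrative only.
if __name__ == "__main__":
    import sys
    # take the target URL from the command line, with an example fallback
    target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/blog"
    found = extract (target, network=True)
    if found is None:
        print ("No feed found for " + target)
    else:
        print ("Feed: " + found)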