#!/usr/bin/python3
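"""Generic extractor: try to discover an RSS/Atom feed URL for a page.

Looks for <link rel="alternate"> feed markup first, then probes common
feed paths, then retries on higher-level pages.
"""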
from utils import *
from download_page import download
# derive this extractor's name ("generic") from the filename, portably
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
def try_common_paths (verbosity, url, args):
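    """Probe a set of well-known feed paths under `url`.

    Query strings, fragments and any trailing slash are stripped first.
    Returns the first candidate URL that answers with HTTP 200, or None.
    """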
debug ("Trying common paths for " + url + "...", verbosity, platform)
# strip extra arguments at end of URL
for symbol in ["?", "&", ";", "#"]:
if symbol in url:
url = url[:url.index(symbol)]
# strip trailing slash (if applicable)
if url.endswith("/"):
url = url[:(-1)]
    # use a tuple so the probe order is deterministic
    common_paths = (
        "atom",
        "atom.xml",
        "feed",
        "feed.atom",
        "feed.rss",
        "feed.xml",
        "rss",
        "rss.xml"
    )
    for path in common_paths:
        page, response_code = download (platform, url + '/' + path, args, verbosity, True)
        if response_code == 200:
            # TODO: verify it is a valid RSS feed;
            # some servers return 200 even for invalid pages,
            # so for now assume we found a feed
            return url + '/' + path
    # failed to find a feed at any common path
    return None
def extract_from_page (page, verbosity, url, args):
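    """Extract a feed URL from a page's <link rel="alternate"> markup.

    Falls back to probing common feed paths, then retries with
    higher-level pages (e.g. a user page or the domain root).
    Returns the feed URL, or None if nothing is found.
    """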
    # Pages often include links like:
    #   <link rel="alternate" type="application/rss+xml" title="My Blog's Feed" href="https://example.com/feed/" />
    # Try the most specific patterns (quoted attribute values) first so we
    # can be more confident in a match, falling back to looser ones.
    # A page may include several such entries; we take the first, which is
    # usually the main feed (later entries are often comment feeds, etc.).
    delimeters = ['"', "'", '']
    feed_types = ["rss", "atom"]
    # '+' may appear literally or as an HTML entity (decimal or hex)
    plus_signs = ['+', "&#43;", "&#x2B;"]
    for delimeter in delimeters:
        for feed_type in feed_types:
            for plus_sign in plus_signs:
                fmt = "type=" + delimeter + "application/" + feed_type + plus_sign + "xml" + delimeter
                before = search (page, '<', fmt, reverse=True)
                after = search (page, fmt, '>')
                # if neither side matched, this format does not appear on the page
                if before is None and after is None:
                    continue
                # normalize to empty strings so the substring checks below are safe
                if before is None:
                    before = ""
                if after is None:
                    after = ""
                string_to_search = None
                if "href=" in before:
                    string_to_search = before
                elif "href=" in after:
                    string_to_search = after
                if string_to_search is not None:
                    # use a distinct loop variable so the outer delimeter is not clobbered
                    for href_delimeter in delimeters:
                        result = search (string_to_search, 'href=' + href_delimeter, href_delimeter)
                        if result is not None:
                            return result
    result = try_common_paths (verbosity, url, args)
    if result is not None:
        return result
    debug ("Failed to find a feed on the page. Trying higher-level pages.", verbosity, platform)
    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]
if path.startswith("/@") or path.startswith("/~"):
offset = 3
elif path.startswith("/user/"):
offset = 7
elif path.startswith("/users/"):
offset = 8
else:
offset = 1
# find first slash after offset (if present)
index = path.find('/',offset)
if index > -1:
path = path[:index+1]
else:
path = '/'
    # don't recurse infinitely
    if domain + path == url:
        return None
    # try the higher-level page
    page = download (platform, domain + path, args, verbosity)
    notify ("Trying " + domain + path + " with generic extractor...", verbosity, platform)
    return extract_from_page (page, verbosity, domain + path, args)
def extract (url, page=None, network=False, verbosity=3, args=None):
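    """Entry point: return a feed URL for `url`, or None.

    If `network` is true and no page content was supplied, the page
    is downloaded first. A relative feed path in the result is
    resolved against the page's domain.
    """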
    # avoid a mutable default argument
    if args is None:
        args = {}
    if network:
        if page is None:
            page = download (platform, url, args, verbosity)
    feed = extract_from_page (page, verbosity, url, args)
    if feed is not None:
        # resolve a relative feed path against the page's domain
        if feed.startswith("/"):
            index = url.find("/", url.find("//") + 2)
            # if the URL has no path, the whole URL is the domain
            domain = url if index == -1 else url[:index]
            feed = domain + feed
    return feed
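
# Minimal usage sketch (assumes the repository's utils and download_page
# modules are importable; the URL below is a placeholder):
#
#   feed = extract ("https://example.com/blog", network=True)
#   if feed is not None:
#       print (feed)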