#!/usr/bin/python3

from utils import *
from download_page import download

# portable way to derive the platform name from this script's filename
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
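# e.g. a copy of this script named "generic.py" yields platform == "generic"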


def try_common_paths (verbosity, url, args):
    debug ("Trying common paths for " + url + "...", verbosity, platform)

    # strip extra arguments (query strings and fragments) from the end of the URL
    for symbol in ["?", "&", ";", "#"]:
        if symbol in url:
            url = url[:url.index(symbol)]

    # strip trailing slash (if applicable)
    if url.endswith("/"):
        url = url[:-1]
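    # e.g. "https://example.com/blog/?page=2" is reduced to "https://example.com/blog"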

    # paths where sites commonly serve their feeds; a tuple keeps the probe order deterministic
    common_paths = (
        "atom",
        "atom.xml",
        "feed",
        "feed.atom",
        "feed.rss",
        "feed.xml",
        "rss",
        "rss.xml",
    )

    for path in common_paths:
        page, response_code = download (platform, url + '/' + path, args, verbosity, True)
        if response_code == 200:
            # TODO: verify it is a valid RSS feed
            # (some pages serve response 200 for invalid pages)

            # assume we found a feed
            return url + '/' + path

    # failed to find a feed at any common path
    return None
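

# A minimal sketch of the validation the TODO above calls for: inspect the start
# of a downloaded body for an RSS or Atom root element. The exact markers checked
# here are an assumption, so this helper is offered as a sketch and not wired in.
def looks_like_feed (page):
    if page is None:
        return False
    # the root element appears in the document prologue, so a short prefix suffices
    head = page[:512].lstrip().lower()
    return "<rss" in head or "<feed" in head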


def extract_from_page (page, verbosity, url, args):

    # Pages often include links like:
    #   <link rel="alternate" type="application/rss+xml" title="My Blog's Feed" href="https://example.com/feed/" />
    # Start with the more specific patterns so we can be more confident in the results.
    # Some pages include multiple such entries; we want the first one, because it is
    # usually the main feed (later entries are often comment feeds, etc.).
    delimeters = ['"', "'", '']             # attribute values may be double-quoted, single-quoted, or unquoted
    feed_types = ["rss", "atom"]
    plus_signs = ['+', "&#43;", "&#x2b;"]   # literal and HTML-entity-encoded plus signs

    for delimeter in delimeters:
        for feed_type in feed_types:
            for plus_sign in plus_signs:
                format = "type=" + delimeter + "application/" + feed_type + plus_sign + "xml" + delimeter
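                # e.g. this builds patterns like type="application/rss+xml"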
                before = search (page, '<', format, reverse=True)
                after = search (page, format, '>')

                # if either is not None, we may have found the feed link
                if before is None and after is None:
                    continue

                # normalize None to "" so we can safely search these strings
                if before is None:
                    before = ""
                if after is None:
                    after = ""

                # the href attribute may appear before or after the type attribute
                string_to_search = None
                if "href=" in before:
                    string_to_search = before
                elif "href=" in after:
                    string_to_search = after
                if string_to_search is not None:
                    for delimeter in delimeters:
                        result = search (string_to_search, 'href=' + delimeter, delimeter)
                        if result is not None:
                            return result

    result = try_common_paths (verbosity, url, args)
    if result is not None:
        return result

    debug ("Failed to find a feed from the page. Trying higher-level pages.", verbosity, platform)

    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]
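
    # e.g. "https://example.com/user/alice/videos" splits into domain
    # "https://example.com" and path "/user/alice/videos"

    # skip past user-style path prefixes ("/@name", "/~name", "/user/name",
    # "/users/name") so the user segment is kept when moving up a level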
    if path.startswith("/@") or path.startswith("/~"):
        offset = 3
    elif path.startswith("/user/"):
        offset = 7
    elif path.startswith("/users/"):
        offset = 8
    else:
        offset = 1

    # find first slash after offset (if present)
    index = path.find('/', offset)
    if index > -1:
        path = path[:index+1]
    else:
        path = '/'

    # we don't want to recurse infinitely
    if domain + path == url:
        return None

    # try again one level higher
    page = download (platform, domain + path, args, verbosity)
    notify ("Trying " + domain + path + " with generic extractor...", verbosity, platform)
    return extract_from_page (page, verbosity, domain + path, args)


def extract (url, page=None, network=False, verbosity=3, args={}):
    if network:
        if page is None:
            page = download (platform, url, args, verbosity)
        feed = extract_from_page (page, verbosity, url, args)
        if feed is not None:
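            # a root-relative feed path needs the page's domain prepended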
            if feed.startswith("/"):
                index = url.find("/", url.find("//") + 2)
                if index == -1:
                    domain = url
                else:
                    domain = url[:index]
                feed = domain + feed
        return feed
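

# A small usage sketch, not part of the original module: run the file directly to
# probe a page for its feed. The example URL is a placeholder, and this assumes
# utils and download_page are importable as the calls above expect.
if __name__ == "__main__":
    import sys
    target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/blog"
    found = extract (target, network=True)
    print (found if found is not None else "no feed found")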