#!/usr/bin/python3
|
|
|
|
from utils import *
|
|
from download_page import download
|
|
|
|
# portable code to get filename
|
|
import os
|
|
# Name of this module, taken from its own file name with a trailing ".py"
# removed when present (e.g. "foo.py" -> "foo"); any other name is kept as is.
platform = os.path.basename(__file__)
platform = platform[:-len(".py")] if platform.endswith(".py") else platform
|
|
|
|
def extract_from_url(url, verbosity):
    """Build the RSS feed address for a Tumblr blog URL without any network access.

    Parameters:
        url: Address of a blog page, e.g. "https://blog.tumblr.com/tagged/art".
            A query string or fragment, if present, is ignored.
        verbosity: Not used by this function.

    Returns:
        None when the URL is on the main tumblr.com site (with or without
        "www."), because only blogs on other domains are handled.
        "<domain>/tagged/<tag>/rss" or "<domain>/search/<term>/rss" when the
        path starts with "/tagged/<tag>" or "/search/<term>".
        "<domain>/rss" (the overall blog feed) in every other case.
    """
    # A query string or fragment is never part of the feed address; left in
    # place it would end up inside the domain or the tag
    # (".../tagged/foo?page=2/rss").
    for delimiter in ("?", "#"):
        url = url.split(delimiter, 1)[0]

    # Split into domain (scheme + host) and path at the first "/" that
    # follows the "//" of the scheme.
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain, path = url, "/"
    else:
        domain, path = url[:index], url[index:]

    primary_domains = {
        "https://tumblr.com",
        "http://tumblr.com",
        "https://www.tumblr.com",
        "http://www.tumblr.com",
    }
    # Scheme and host are case-insensitive, so compare in lower case.
    if domain.lower() in primary_domains:
        # only handle blogs on other subdomains
        return None

    for page_type in ("tagged", "search"):
        prefix = "/" + page_type + "/"
        if path.startswith(prefix):
            # The tag runs up to the next "/" or, without one, to the end.
            tag = path[len(prefix):].split("/", 1)[0]
            if tag:
                return domain + prefix + tag + "/rss"

    # if we've reached this point, just return overall blog feed
    return domain + "/rss"
|
|
|
|
def extract_from_page(page, verbosity):
    """Fallback lookup of the feed address inside a downloaded page.

    Passes ``page`` to the ``search`` helper together with the text that
    opens an RSS ``<link rel="alternate">`` tag up to its ``href`` value and
    the text that closes the tag, and returns whatever ``search`` returns.
    Presumably that is the text found between the two markers, or None when
    they are absent; confirm against ``search`` in utils.

    Parameters:
        page: Page content handed to ``search``.
        verbosity: Not used by this function.
    """
    # this method should not be called
    link_opening = '<link rel="alternate" type="application/rss+xml" href="'
    link_closing = '">'
    return search(page, link_opening, link_closing)
|
|
|
|
def extract(url, page=None, network=False, verbosity=3, args=None):
    """Return the RSS feed address for a Tumblr URL, or None if none is found.

    The feed is first derived from the URL alone. Only when that fails, and
    ``network`` is enabled, is the page downloaded and searched for a feed
    link.

    Parameters:
        url: Address of the blog page.
        page: Accepted for interface compatibility. NOTE(review): the value
            passed in is never read; when a download happens it is replaced
            by the downloaded page.
        network: When truthy, allow downloading ``url`` as a fallback.
        verbosity: Forwarded to the helper functions and to ``notify``.
        args: Extra options forwarded to ``download``; defaults to a new
            empty dict for each call.

    Returns:
        The feed address, or None when it cannot be determined.
    """
    # A dict literal as default would be one object shared by every call.
    if args is None:
        args = {}

    feed = extract_from_url(url, verbosity)
    if feed is not None:
        return feed

    notify("Unable to get feed from URL alone", verbosity, platform)
    if network:
        page = download(platform, url, args, verbosity)
        feed = extract_from_page(page, verbosity)
        if feed is not None:
            return feed

    return None
|