rsstube/scripts/extractors/tumblr.py

#!/usr/bin/python3
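
# rsstube extractor for Tumblr blogs: derives the RSS feed URL for a blog
# (and for its tagged/search pages) directly from the page URL, and falls
# back to scraping the RSS autodiscovery link from the page when needed.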
from utils import *
from download_page import download
# portable code to get filename
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
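

# Build the feed URL from the blog URL alone, without any network access:
# a blog's main feed lives at <blog>/rss, and tag or search pages map to
# <blog>/tagged/<tag>/rss and <blog>/search/<term>/rss respectively.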
def extract_from_url (url, verbosity):
    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]

    primary_domain = {
        "https://tumblr.com",
        "http://tumblr.com",
        "https://www.tumblr.com",
        "http://www.tumblr.com"
    }
    if domain in primary_domain:
        # only handle blogs on other subdomains
        return None
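
    # tag and search pages get their own narrower feed: pull the tag or
    # search term out of the path and append /rss to it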
    for page_type in ["tagged", "search"]:
        page_type_with_slashes = "/" + page_type + "/"
        if path.startswith(page_type_with_slashes):
            offset = len(page_type_with_slashes)
            tag_end = path.find('/', offset)
            if tag_end < 0:
                # no trailing slash, go to end
                tag = path[offset:]
            else:
                tag = path[offset:tag_end]
            if tag:
                return domain + page_type_with_slashes + tag + "/rss"

    # if we've reached this point, just return overall blog feed
    return domain + "/rss"
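

# Fallback: scrape the RSS autodiscovery <link> tag out of a downloaded page.
# Only reached when extract_from_url returned None (URL on the main
# tumblr.com domain) and the caller allowed network access.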
def extract_from_page (page, verbosity):
    # this method should not be called
    return search (page, '<link rel="alternate" type="application/rss+xml" href="', '">')
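

# Entry point: try the URL alone first; download the page only when that
# fails and network access is enabled.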
def extract (url, page=None, network=False, verbosity=3, args={}):
    feed = extract_from_url (url, verbosity)
    if feed is not None:
        return feed
    else:
        notify ("Unable to get feed from URL alone", verbosity, platform)

    if network:
        page = download (platform, url, args, verbosity)
        feed = extract_from_page (page, verbosity)
        if feed is not None:
            return feed
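

if __name__ == "__main__":
    # Minimal smoke test (illustrative; assumes the rsstube scripts directory
    # is on sys.path so that the utils and download_page imports resolve).
    # A blog URL maps straight to its /rss feed:
    print (extract_from_url ("https://staff.tumblr.com", 3))
    # -> https://staff.tumblr.com/rss
    # A tagged page maps to a per-tag feed:
    print (extract_from_url ("https://staff.tumblr.com/tagged/features", 3))
    # -> https://staff.tumblr.com/tagged/features/rss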