rsstube/scripts/extractors/tumblr.py

#!/usr/bin/python3
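
# rsstube extractor for Tumblr blogs: derives the RSS feed URL for a blog
# (and for its tagged/search pages) directly from the page URL, and falls
# back to scraping the RSS autodiscovery link from the page when needed.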
from utils import *
from download_page import download
# portable code to get filename
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
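

# Build the feed URL from the blog URL alone, without any network access:
# a blog's main feed lives at <blog>/rss, and tag or search pages map to
# <blog>/tagged/<tag>/rss and <blog>/search/<term>/rss respectively.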
def extract_from_url (url, verbosity):
    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]

    primary_domain = {
        "https://tumblr.com",
        "http://tumblr.com",
        "https://www.tumblr.com",
        "http://www.tumblr.com"
    }
    if domain in primary_domain:
        # only handle blogs on other subdomains
        return None
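
    # tag and search pages get their own narrower feed: pull the tag or
    # search term out of the path and append /rss to it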
    for page_type in ["tagged", "search"]:
        page_type_with_slashes = "/" + page_type + "/"
        if path.startswith(page_type_with_slashes):
            offset = len(page_type_with_slashes)
            tag_end = path.find('/', offset)
            if tag_end < 0:
                # no trailing slash, go to end
                tag = path[offset:]
            else:
                tag = path[offset:tag_end]
            if tag:
                return domain + page_type_with_slashes + tag + "/rss"

    # if we've reached this point, just return overall blog feed
    return domain + "/rss"
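

# Fallback: scrape the RSS autodiscovery <link> tag out of a downloaded page.
# Only reached when extract_from_url returned None (URL on the main
# tumblr.com domain) and the caller allowed network access.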
def extract_from_page (page, verbosity):
    # this method should not be called
    return search (page, '<link rel="alternate" type="application/rss+xml" href="', '">')
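

# Entry point: try the URL alone first; download the page only when that
# fails and network access is enabled.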
def extract (url, page=None, network=False, verbosity=3, args={}):
    feed = extract_from_url (url, verbosity)
    if feed is not None:
        return feed
    else:
        notify ("Unable to get feed from URL alone", verbosity, platform)

    if network:
        page = download (platform, url, args, verbosity)
        feed = extract_from_page (page, verbosity)
        if feed is not None:
            return feed
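

if __name__ == "__main__":
    # Minimal smoke test (illustrative; assumes the rsstube scripts directory
    # is on sys.path so that the utils and download_page imports resolve).
    # A blog URL maps straight to its /rss feed:
    print (extract_from_url ("https://staff.tumblr.com", 3))
    # -> https://staff.tumblr.com/rss
    # A tagged page maps to a per-tag feed:
    print (extract_from_url ("https://staff.tumblr.com/tagged/features", 3))
    # -> https://staff.tumblr.com/tagged/features/rss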