#!/usr/bin/python3

from utils import *
from download_page import download

# portable way to derive the platform name from this script's filename
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
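# e.g. a copy of this script named "generic.py" yields platform == "generic"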


def try_common_paths (verbosity, url, args):
    debug ("Trying common paths for " + url + "...", verbosity, platform)

    # strip extra arguments (query strings and fragments) from the end of the URL
    for symbol in ["?", "&", ";", "#"]:
        if symbol in url:
            url = url[:url.index(symbol)]

    # strip trailing slash (if applicable)
    if url.endswith("/"):
        url = url[:-1]
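    # e.g. "https://example.com/blog/?page=2" is reduced to "https://example.com/blog"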

    # paths where sites commonly serve their feeds; a tuple keeps the probe order deterministic
    common_paths = (
        "atom",
        "atom.xml",
        "feed",
        "feed.atom",
        "feed.rss",
        "feed.xml",
        "rss",
        "rss.xml",
    )

    for path in common_paths:
        page, response_code = download (platform, url + '/' + path, args, verbosity, True)
        if response_code == 200:
            # TODO: verify it is a valid RSS feed
            # (some pages serve response 200 for invalid pages)

            # assume we found a feed
            return url + '/' + path

    # failed to find a feed at any common path
    return None
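

# A minimal sketch of the validation the TODO above calls for: inspect the start
# of a downloaded body for an RSS or Atom root element. The exact markers checked
# here are an assumption, so this helper is offered as a sketch and not wired in.
def looks_like_feed (page):
    if page is None:
        return False
    # the root element appears in the document prologue, so a short prefix suffices
    head = page[:512].lstrip().lower()
    return "<rss" in head or "<feed" in head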


def extract_from_page (page, verbosity, url, args):

    # Pages often include links like:
    #   <link rel="alternate" type="application/rss+xml" title="My Blog's Feed" href="https://example.com/feed/" />
    # Start with the more specific patterns so we can be more confident in the results.
    # Some pages include multiple such entries; we want the first one, because it is
    # usually the main feed (later entries are often comment feeds, etc.).
    delimeters = ['"', "'", '']             # attribute values may be double-quoted, single-quoted, or unquoted
    feed_types = ["rss", "atom"]
    plus_signs = ['+', "&#43;", "&#x2b;"]   # literal and HTML-entity-encoded plus signs

    for delimeter in delimeters:
        for feed_type in feed_types:
            for plus_sign in plus_signs:
                format = "type=" + delimeter + "application/" + feed_type + plus_sign + "xml" + delimeter
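                # e.g. this builds patterns like type="application/rss+xml"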
                before = search (page, '<', format, reverse=True)
                after = search (page, format, '>')

                # if either is not None, we may have found the feed link
                if before is None and after is None:
                    continue

                # normalize None to "" so we can safely search these strings
                if before is None:
                    before = ""
                if after is None:
                    after = ""

                # the href attribute may appear before or after the type attribute
                string_to_search = None
                if "href=" in before:
                    string_to_search = before
                elif "href=" in after:
                    string_to_search = after
                if string_to_search is not None:
                    for delimeter in delimeters:
                        result = search (string_to_search, 'href=' + delimeter, delimeter)
                        if result is not None:
                            return result

    result = try_common_paths (verbosity, url, args)
    if result is not None:
        return result

    debug ("Failed to find a feed from the page. Trying higher-level pages.", verbosity, platform)

    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]
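
    # e.g. "https://example.com/user/alice/videos" splits into domain
    # "https://example.com" and path "/user/alice/videos"

    # skip past user-style path prefixes ("/@name", "/~name", "/user/name",
    # "/users/name") so the user segment is kept when moving up a level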
    if path.startswith("/@") or path.startswith("/~"):
        offset = 3
    elif path.startswith("/user/"):
        offset = 7
    elif path.startswith("/users/"):
        offset = 8
    else:
        offset = 1

    # find first slash after offset (if present)
    index = path.find('/', offset)
    if index > -1:
        path = path[:index+1]
    else:
        path = '/'

    # we don't want to recurse infinitely
    if domain + path == url:
        return None

    # try again one level higher
    page = download (platform, domain + path, args, verbosity)
    notify ("Trying " + domain + path + " with generic extractor...", verbosity, platform)
    return extract_from_page (page, verbosity, domain + path, args)


def extract (url, page=None, network=False, verbosity=3, args={}):
    if network:
        if page is None:
            page = download (platform, url, args, verbosity)
        feed = extract_from_page (page, verbosity, url, args)
        if feed is not None:
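            # a root-relative feed path needs the page's domain prepended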
            if feed.startswith("/"):
                index = url.find("/", url.find("//") + 2)
                if index == -1:
                    domain = url
                else:
                    domain = url[:index]
                feed = domain + feed
        return feed
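

# A small usage sketch, not part of the original module: run the file directly to
# probe a page for its feed. The example URL is a placeholder, and this assumes
# utils and download_page are importable as the calls above expect.
if __name__ == "__main__":
    import sys
    target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/blog"
    found = extract (target, network=True)
    print (found if found is not None else "no feed found")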