#!/usr/bin/python3
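"""Generic extractor: try to discover an RSS/Atom feed URL for a page.

Looks for <link rel="alternate"> feed markup first, then probes common
feed paths, then retries on higher-level pages.
"""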
from utils import *
from download_page import download
# derive this extractor's name ("generic") from the filename, portably
import os
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
def try_common_paths (verbosity, url, args):
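    """Probe a set of well-known feed paths under `url`.

    Query strings, fragments and any trailing slash are stripped first.
    Returns the first candidate URL that answers with HTTP 200, or None.
    """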
debug ("Trying common paths for " + url + "...", verbosity, platform)
# strip extra arguments at end of URL
for symbol in ["?", "&", ";", "#"]:
if symbol in url:
url = url[:url.index(symbol)]
# strip trailing slash (if applicable)
if url.endswith("/"):
url = url[:(-1)]
    # use a tuple so the probe order is deterministic
    common_paths = (
        "atom",
        "atom.xml",
        "feed",
        "feed.atom",
        "feed.rss",
        "feed.xml",
        "rss",
        "rss.xml"
    )
    for path in common_paths:
        page, response_code = download (platform, url + '/' + path, args, verbosity, True)
        if response_code == 200:
            # TODO: verify it is a valid RSS feed;
            # some servers return 200 even for invalid pages,
            # so for now assume we found a feed
            return url + '/' + path
    # failed to find a feed at any common path
    return None
def extract_from_page (page, verbosity, url, args):
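    """Extract a feed URL from a page's <link rel="alternate"> markup.

    Falls back to probing common feed paths, then retries with
    higher-level pages (e.g. a user page or the domain root).
    Returns the feed URL, or None if nothing is found.
    """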
    # Pages often include links like:
    #   <link rel="alternate" type="application/rss+xml" title="My Blog's Feed" href="https://example.com/feed/" />
    # Try the most specific patterns (quoted attribute values) first so we
    # can be more confident in a match, falling back to looser ones.
    # A page may include several such entries; we take the first, which is
    # usually the main feed (later entries are often comment feeds, etc.).
    delimeters = ['"', "'", '']
    feed_types = ["rss", "atom"]
    # '+' may appear literally or as an HTML entity (decimal or hex)
    plus_signs = ['+', "&#43;", "&#x2B;"]
    for delimeter in delimeters:
        for feed_type in feed_types:
            for plus_sign in plus_signs:
                fmt = "type=" + delimeter + "application/" + feed_type + plus_sign + "xml" + delimeter
                before = search (page, '<', fmt, reverse=True)
                after = search (page, fmt, '>')
                # if neither side matched, this format does not appear on the page
                if before is None and after is None:
                    continue
                # normalize to empty strings so the substring checks below are safe
                if before is None:
                    before = ""
                if after is None:
                    after = ""
                string_to_search = None
                if "href=" in before:
                    string_to_search = before
                elif "href=" in after:
                    string_to_search = after
                if string_to_search is not None:
                    # use a distinct loop variable so the outer delimeter is not clobbered
                    for href_delimeter in delimeters:
                        result = search (string_to_search, 'href=' + href_delimeter, href_delimeter)
                        if result is not None:
                            return result
    result = try_common_paths (verbosity, url, args)
    if result is not None:
        return result
    debug ("Failed to find a feed on the page. Trying higher-level pages.", verbosity, platform)
    # split into domain and path
    index = url.find("/", url.find("//") + 2)
    if index == -1:
        domain = url
        path = "/"
    else:
        domain = url[:index]
        path = url[index:]
if path.startswith("/@") or path.startswith("/~"):
offset = 3
elif path.startswith("/user/"):
offset = 7
elif path.startswith("/users/"):
offset = 8
else:
offset = 1
# find first slash after offset (if present)
index = path.find('/',offset)
if index > -1:
path = path[:index+1]
else:
path = '/'
    # don't recurse infinitely
    if domain + path == url:
        return None
    # try the higher-level page
    page = download (platform, domain + path, args, verbosity)
    notify ("Trying " + domain + path + " with generic extractor...", verbosity, platform)
    return extract_from_page (page, verbosity, domain + path, args)
def extract (url, page=None, network=False, verbosity=3, args=None):
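    """Entry point: return a feed URL for `url`, or None.

    If `network` is true and no page content was supplied, the page
    is downloaded first. A relative feed path in the result is
    resolved against the page's domain.
    """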
    # avoid a mutable default argument
    if args is None:
        args = {}
    if network:
        if page is None:
            page = download (platform, url, args, verbosity)
    feed = extract_from_page (page, verbosity, url, args)
    if feed is not None:
        # resolve a relative feed path against the page's domain
        if feed.startswith("/"):
            index = url.find("/", url.find("//") + 2)
            # if the URL has no path, the whole URL is the domain
            domain = url if index == -1 else url[:index]
            feed = domain + feed
    return feed
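
# Minimal usage sketch (assumes the repository's utils and download_page
# modules are importable; the URL below is a placeholder):
#
#   feed = extract ("https://example.com/blog", network=True)
#   if feed is not None:
#       print (feed)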