rsstube/scripts/extractors/peertube.py

#!/usr/bin/python3

from utils import *
from download_page import download

# portable code to get filename
import os
# Label for this extractor, taken from this file's own name with a trailing
# ".py" dropped; it is passed to notify/debug/download in the code below.
platform = os.path.basename(__file__)
platform = platform[:-3] if platform.endswith(".py") else platform

def extract_from_page (page, verbosity, url, args):
	"""Build the PeerTube RSS feed URL for an account, channel or video URL.

	The URL is split into instance domain, page type ("accounts",
	"video-channels" or "videos") and item name. The matching
	"/api/v1/<page type>/<name>" page is downloaded and the first '"id":'
	value found in it is used to build a "/feeds/videos.xml" URL. For a
	video URL the channel URL is first read out of the video's API page and
	the channel's API page is then queried instead.

	Parameters:
		page: not read by this function; kept for the extractor interface.
		verbosity: forwarded to notify/debug/download.
		url: the PeerTube URL given by the user.
		args: forwarded unchanged to download.

	Returns the feed URL as a string, or None when the URL format is not
	recognized or the required values are not found in the API pages.

	NOTE(review): notify, debug and search presumably come from the
	wildcard import of utils; their exact behavior is not visible here.
	"""
	# strip extra arguments at end of URL
	for symbol in ["?", "&", ";"]:
		if symbol in url:
			url = url[:url.index(symbol)]

	# split into domain and path
	index = url.find("/",url.find("//")+2)
	domain = url[:index]
	path = url[index:]

	# get page type
	index = path.find("/",1)
	page_type = path[1:index]

	# get item name: after this block, index is the slash just before the
	# name and index2 is the slash just after it (or -1 if there is none)
	if page_type == "accounts" or page_type == "video-channels":
		index2 = path.find("/",index+1)
	elif page_type == "videos":
		if path.startswith("/videos/watch/playlist/"):
			notify ("PeerTube playlists don't seem to have API pages", verbosity, platform)
			return None
		elif path.startswith("/videos/watch/"):
			# format is like https://example.com/videos/watch/uuid
			index = path.find("/",index+1)
			index2 = path.find("/",index+1)
		else:
			notify ("Unrecognized URL format.", verbosity, platform)
			return None
	else:
		notify ("Unrecognized URL format.", verbosity, platform)
		return None
	if index2 < 0:
		name = path[index+1:]
	else:
		name = path[index+1:index2]

	# An empty name (e.g. ".../accounts/") would query the collection
	# endpoint instead of a single item, so an unrelated "id" could be used.
	if not name:
		notify ("Unrecognized URL format.", verbosity, platform)
		return None

	# account on other instance
	if '@' in name:
		# TODO: how do we handle protocol (http vs. https)?
		# for now, assume it's the same as url, or https if not specified
		if "//" in domain:
			protocol = domain[:domain.index("//")+2]
		else:
			debug ("Assuming HTTPS", verbosity, platform)
			protocol = "https://"
		index = name.index('@')
		domain = protocol + name[index+1:]
		name = name[:index]
		debug ("Translating " + url + " into " + domain + "/" + page_type + "/" + name, verbosity, platform)

	# get API page
	api_page_url = domain + "/api/v1/" + page_type + "/" + name
	api_page = download (platform, api_page_url, args, verbosity)

	if page_type == "videos":
		# TODO: This doesn't need two API calls if we just parse the JSON
		# query API for video-channels page

		# search from end, not from beginning, as "/video-channels/" is significant part
		domain = search (api_page, '"url":"', '/video-channels/', reverse=True)
		if domain is None:
			# Stop here: name still holds the video UUID and domain is None,
			# so building the channel API URL would raise TypeError.
			debug ("Could not find channel URL in video API page", verbosity, platform)
			return None
		name = search (api_page, '"url":"' + domain + '/video-channels/', '"')
		if name is None:
			return None
		page_type = "video-channels"
		api_page_url = domain + "/api/v1/" + page_type + "/" + name
		api_page = download (platform, api_page_url, args, verbosity)

	ident = search (api_page, '"id":', ",")
	if ident is None:
		return None

	if page_type == "accounts":
		return domain + "/feeds/videos.xml?accountId=" + ident
	elif page_type == "video-channels":
		return domain + "/feeds/videos.xml?videoChannelId=" + ident
	return None

def extract (url, page=None, network=False, verbosity=3, args=None):
	"""Return the PeerTube feed URL for url, or None if it cannot be found.

	Parameters:
		url: the PeerTube URL; needed because the instance domain is read from it.
		page: passed through to extract_from_page.
		network: when false, None is returned immediately, because the
			feed cannot be derived from the URL alone.
		verbosity: passed through to extract_from_page.
		args: passed through to extract_from_page; None means an empty dict.
	"""
	# cannot get feed from URL alone
	if not network:
		return None

	# Use a fresh dict per call instead of one mutable default shared by
	# every call that omits args.
	if args is None:
		args = {}

	# note: we need the URL for the domain
	return extract_from_page (page, verbosity, url, args)
Initial code push. This version of rsstube works but is not complete. 2021-07-22 20:00:00 -04:00			`#!/usr/bin/python3`

			`from utils import *`
			`from download_page import download`

			`# portable code to get filename`
			`import os`
			`platform = os.path.basename(__file__)`
			`if platform.endswith(".py"):`
			`platform = platform[:(-3)]`

			`def extract_from_page (page, verbosity, url, args):`
			`# strip extra arguments at end of URL`
			`for symbol in ["?", "&", ";"]:`
			`if symbol in url:`
			`url = url[:url.index(symbol)]`

			`# split into domain and path`
			`index = url.find("/",url.find("//")+2)`
			`domain = url[:index]`
			`path = url[index:]`

			`# get page type`
			`index = path.find("/",1)`
			`page_type = path[1:index]`

			`# get item name`
			`if page_type == "accounts" or page_type == "video-channels":`
			`index2 = path.find("/",index+1)`
			`elif page_type == "videos":`
			`# assume UUID is last thing in URL after cleaning additional args`
			`# end index is -1 in case of trailing slash`
			`name = path[path.rindex("/",0,-1):]`
			`if path.startswith("/videos/watch/playlist/"):`
			`notify ("PeerTube playlists don't seem to have API pages", verbosity, platform)`
			`return`
			`elif path.startswith("/videos/watch/"):`
			`# format is like https://example.com/videos/watch/uuid`
			`index = path.find("/",index+1)`
			`index2 = path.find("/",index+2)`
			`else:`
			`notify ("Unrecognized URL format.", verbosity, platform)`
			`return`
			`else:`
			`notify ("Unrecognized URL format.", verbosity, platform)`
			`return None`
			`if index2 < 0:`
			`name = path[index+1:]`
			`else:`
			`name = path[index+1:index2]`

			`# account on other instance`
			`if '@' in name:`
			`# TODO: how do we handle protocol (http vs. https)?`
			`# for now, assume it's the same as url, or https if not specified`
			`if "//" in domain:`
			`protocol = domain[:domain.index("//")+2]`
			`else:`
			`debug ("Assuming HTTPS", verbosity, platform)`
			`protocol = "https://"`
			`index = name.index('@')`
			`domain = protocol + name[index+1:]`
			`name = name[:index]`
			`debug ("Translating " + url + " into " + domain + "/" + page_type + "/" + name, verbosity, platform)`

			`# get API page`
			`api_page_url = domain + "/api/v1/" + page_type + "/" + name`
			`api_page = download (platform, api_page_url, args, verbosity)`

			`if page_type == "videos":`
			`# TODO: This doesn't need two API calls if we just parse the JSON`
			`# query API for video-channels page`

			`# search from end, not from beginning, as "/video-channels/" is significant part`
			`domain = search (api_page, '"url":"', '/video-channels/', reverse=True)`
			`if not domain is None:`
			`name = search (api_page, '"url":"' + domain + '/video-channels/', '"')`
			`if not name is None:`
			`page_type = "video-channels"`
			`api_page_url = domain + "/api/v1/" + page_type + "/" + name`
			`api_page = download (platform, api_page_url, args, verbosity)`

			`ident = search (api_page, '"id":', ",")`

			`if not ident is None:`
			`if page_type == "accounts":`
			`return domain + "/feeds/videos.xml?accountId=" + ident`
			`elif page_type == "video-channels":`
			`return domain + "/feeds/videos.xml?videoChannelId=" + ident`

			`def extract (url, page=None, network=False, verbosity=3, args={}):`
			`# cannot get feed from URL alone`
			`if not network:`
			`return None`

			`# note: we need the URL for the domain`
			`feed = extract_from_page (page, verbosity, url, args)`
			`if not feed is None:`
			`return feed`