# rsstube/scripts/extractors/peertube.py
#!/usr/bin/python3
from utils import *
from download_page import download
# portable way to derive this extractor's name from its own file name
import os

platform = os.path.basename(__file__)
# drop a trailing ".py" extension, if present
platform = platform[:-3] if platform.endswith(".py") else platform
def extract_from_page (page, verbosity, url, args):
    """Translate a PeerTube URL into the instance's RSS feed URL.

    Supports /accounts/<name>, /video-channels/<name> and
    /videos/watch/<uuid> URLs.  For a single video the instance's API is
    queried to resolve the video to its channel, and the channel feed is
    returned.  Always needs network access (downloads API pages).

    :param page: unused here; present for extractor-interface uniformity
    :param verbosity: verbosity level forwarded to notify()/debug()
    :param url: the PeerTube URL to translate
    :param args: extra arguments forwarded to download()
    :return: the feed URL string, or None if it cannot be determined
    """
    # strip extra arguments (query string, etc.) at end of URL
    for symbol in ["?", "&", ";"]:
        if symbol in url:
            url = url[:url.index(symbol)]
    # split into domain (scheme + host) and path
    index = url.find("/", url.find("//") + 2)
    domain = url[:index]
    path = url[index:]
    # the first path component is the page type
    index = path.find("/", 1)
    page_type = path[1:index]
    # locate the item name within the path
    if page_type in ("accounts", "video-channels"):
        # name is the second path component
        index2 = path.find("/", index + 1)
    elif page_type == "videos":
        if path.startswith("/videos/watch/playlist/"):
            notify ("PeerTube playlists don't seem to have API pages", verbosity, platform)
            return None
        elif path.startswith("/videos/watch/"):
            # format is like https://example.com/videos/watch/uuid
            # skip the "watch" component so the name slice below picks the UUID
            index = path.find("/", index + 1)
            index2 = path.find("/", index + 2)
        else:
            notify ("Unrecognized URL format.", verbosity, platform)
            return None
    else:
        notify ("Unrecognized URL format.", verbosity, platform)
        return None
    # slice out the item name; index2 < 0 means no trailing slash after it
    if index2 < 0:
        name = path[index + 1:]
    else:
        name = path[index + 1:index2]
    # "name@other.instance" means the account lives on another instance
    if '@' in name:
        # TODO: how do we handle protocol (http vs. https)?
        # for now, assume it's the same as url, or https if not specified
        if "//" in domain:
            protocol = domain[:domain.index("//") + 2]
        else:
            debug ("Assuming HTTPS", verbosity, platform)
            protocol = "https://"
        index = name.index('@')
        domain = protocol + name[index + 1:]
        name = name[:index]
    debug ("Translating " + url + " into " + domain + "/" + page_type + "/" + name, verbosity, platform)
    # fetch the API page describing the item
    api_page_url = domain + "/api/v1/" + page_type + "/" + name
    api_page = download (platform, api_page_url, args, verbosity)
    if page_type == "videos":
        # TODO: This doesn't need two API calls if we just parse the JSON
        # resolve the video to its channel, then switch to the channel's
        # API page; search from end, not from beginning, as
        # "/video-channels/" is the significant part
        domain = search (api_page, '"url":"', '/video-channels/', reverse=True)
        if domain is not None:
            name = search (api_page, '"url":"' + domain + '/video-channels/', '"')
            if name is not None:
                page_type = "video-channels"
                api_page_url = domain + "/api/v1/" + page_type + "/" + name
                api_page = download (platform, api_page_url, args, verbosity)
    # the numeric id is what the feed endpoints key on
    # (search() presumably returns the text between the delimiters — from utils)
    ident = search (api_page, '"id":', ",")
    if ident is not None:
        if page_type == "accounts":
            return domain + "/feeds/videos.xml?accountId=" + ident
        elif page_type == "video-channels":
            return domain + "/feeds/videos.xml?videoChannelId=" + ident
    return None
def extract (url, page=None, network=False, verbosity=3, args={}):
    """Extractor entry point: return the feed URL for a PeerTube page.

    The feed cannot be derived from the URL alone, so network access is
    required; with network disabled this returns None immediately.
    """
    if network:
        # note: we need the URL itself for the domain
        result = extract_from_page (page, verbosity, url, args)
        if result is not None:
            return result
    # no network, or no feed found
    return None