# rsstube/scripts/extractors/peertube.py
#!/usr/bin/python3
from utils import *
from download_page import download
# portable way to derive this extractor's name from its own file name
import os

platform = os.path.basename(__file__)
# drop a trailing ".py" extension, if present
platform = platform[:-3] if platform.endswith(".py") else platform
def extract_from_page (page, verbosity, url, args):
    """Translate a PeerTube URL into the instance's RSS feed URL.

    Supports /accounts/<name>, /video-channels/<name> and
    /videos/watch/<uuid> URLs.  For a single video the instance's API is
    queried to resolve the video to its channel, and the channel feed is
    returned.  Always needs network access (downloads API pages).

    :param page: unused here; present for extractor-interface uniformity
    :param verbosity: verbosity level forwarded to notify()/debug()
    :param url: the PeerTube URL to translate
    :param args: extra arguments forwarded to download()
    :return: the feed URL string, or None if it cannot be determined
    """
    # strip extra arguments (query string, etc.) at end of URL
    for symbol in ["?", "&", ";"]:
        if symbol in url:
            url = url[:url.index(symbol)]
    # split into domain (scheme + host) and path
    index = url.find("/", url.find("//") + 2)
    domain = url[:index]
    path = url[index:]
    # the first path component is the page type
    index = path.find("/", 1)
    page_type = path[1:index]
    # locate the item name within the path
    if page_type in ("accounts", "video-channels"):
        # name is the second path component
        index2 = path.find("/", index + 1)
    elif page_type == "videos":
        if path.startswith("/videos/watch/playlist/"):
            notify ("PeerTube playlists don't seem to have API pages", verbosity, platform)
            return None
        elif path.startswith("/videos/watch/"):
            # format is like https://example.com/videos/watch/uuid
            # skip the "watch" component so the name slice below picks the UUID
            index = path.find("/", index + 1)
            index2 = path.find("/", index + 2)
        else:
            notify ("Unrecognized URL format.", verbosity, platform)
            return None
    else:
        notify ("Unrecognized URL format.", verbosity, platform)
        return None
    # slice out the item name; index2 < 0 means no trailing slash after it
    if index2 < 0:
        name = path[index + 1:]
    else:
        name = path[index + 1:index2]
    # "name@other.instance" means the account lives on another instance
    if '@' in name:
        # TODO: how do we handle protocol (http vs. https)?
        # for now, assume it's the same as url, or https if not specified
        if "//" in domain:
            protocol = domain[:domain.index("//") + 2]
        else:
            debug ("Assuming HTTPS", verbosity, platform)
            protocol = "https://"
        index = name.index('@')
        domain = protocol + name[index + 1:]
        name = name[:index]
    debug ("Translating " + url + " into " + domain + "/" + page_type + "/" + name, verbosity, platform)
    # fetch the API page describing the item
    api_page_url = domain + "/api/v1/" + page_type + "/" + name
    api_page = download (platform, api_page_url, args, verbosity)
    if page_type == "videos":
        # TODO: This doesn't need two API calls if we just parse the JSON
        # resolve the video to its channel, then switch to the channel's
        # API page; search from end, not from beginning, as
        # "/video-channels/" is the significant part
        domain = search (api_page, '"url":"', '/video-channels/', reverse=True)
        if domain is not None:
            name = search (api_page, '"url":"' + domain + '/video-channels/', '"')
            if name is not None:
                page_type = "video-channels"
                api_page_url = domain + "/api/v1/" + page_type + "/" + name
                api_page = download (platform, api_page_url, args, verbosity)
    # the numeric id is what the feed endpoints key on
    # (search() presumably returns the text between the delimiters — from utils)
    ident = search (api_page, '"id":', ",")
    if ident is not None:
        if page_type == "accounts":
            return domain + "/feeds/videos.xml?accountId=" + ident
        elif page_type == "video-channels":
            return domain + "/feeds/videos.xml?videoChannelId=" + ident
    return None
def extract (url, page=None, network=False, verbosity=3, args={}):
    """Extractor entry point: return the feed URL for a PeerTube page.

    The feed cannot be derived from the URL alone, so network access is
    required; with network disabled this returns None immediately.
    """
    if network:
        # note: we need the URL itself for the domain
        result = extract_from_page (page, verbosity, url, args)
        if result is not None:
            return result
    # no network, or no feed found
    return None