100 lines
3.1 KiB
Python
100 lines
3.1 KiB
Python
|
#!/usr/bin/python3
|
||
|
|
||
|
from utils import *
|
||
|
from download_page import download
|
||
|
|
||
|
# portable code to get filename
import os

# Label this module by its own file name without the ".py" suffix
# (e.g. "foo.py" -> "foo"); the functions below pass this label to
# notify(), debug() and download().
platform = os.path.basename(__file__)
if platform.endswith(".py"):
    platform = platform[:-3]
|
||
|
|
||
|
def extract_from_page(page, verbosity, url, args):
    """Return the video feed URL for a PeerTube account, channel or video URL.

    The item is identified from the URL path, looked up through the
    instance's ``/api/v1/`` endpoints to read its id, and that id is placed
    in a ``/feeds/videos.xml`` query string. For a video URL the owning
    video channel is looked up and the channel's feed is returned.

    ``notify``, ``debug``, ``search``, ``download`` and ``platform`` are
    module-level names defined outside this function.
    NOTE(review): ``search`` is assumed to return the text found between two
    markers (or ``None``), and ``download`` the page body (or ``None`` on
    failure) -- confirm against their definitions.

    Args:
        page: Unused; the lookup is driven entirely by ``url``.
        verbosity: Forwarded to ``notify``, ``debug`` and ``download``.
        url: Address of an ``/accounts/<name>``, ``/video-channels/<name>``
            or ``/videos/watch/<uuid>`` page. A ``name@host`` handle is
            resolved against ``host`` instead of the URL's own domain.
        args: Forwarded unchanged to ``download``.

    Returns:
        The feed URL as a string, or ``None`` when the URL is not a
        supported page or no id could be read from the API response.
    """
    unrecognized = "Unrecognized URL format."

    # strip extra arguments (and any fragment) at end of URL
    for symbol in ("?", "&", ";", "#"):
        url = url.split(symbol, 1)[0]

    # split into domain and path; a URL without a path has nothing to look
    # up, and slicing with find()'s -1 would chop the domain's last character
    path_start = url.find("/", url.find("//") + 2)
    if path_start < 0:
        notify(unrecognized, verbosity, platform)
        return None
    domain = url[:path_start]
    path = url[path_start:]

    # get page type: path starts with "/", so segments[0] is empty and
    # segments[1] is the page type even when nothing follows it
    segments = path.split("/")
    page_type = segments[1]

    # get item name
    if page_type in ("accounts", "video-channels"):
        name = segments[2] if len(segments) > 2 else ""
    elif page_type == "videos":
        if path.startswith("/videos/watch/playlist/"):
            notify("PeerTube playlists don't seem to have API pages", verbosity, platform)
            return None
        if not path.startswith("/videos/watch/"):
            notify(unrecognized, verbosity, platform)
            return None
        # format is like https://example.com/videos/watch/uuid
        # (the startswith check above guarantees segments[3] exists)
        name = segments[3]
    else:
        notify(unrecognized, verbosity, platform)
        return None

    # account on other instance: split "name@host" at the first "@"
    remote_host = None
    if "@" in name:
        name, _, remote_host = name.partition("@")

    # an empty name (or empty remote host) would make the request below hit
    # the bare /api/v1/<page_type>/ path instead of a single item
    if not name or remote_host == "":
        notify(unrecognized, verbosity, platform)
        return None

    if remote_host is not None:
        # TODO: how do we handle protocol (http vs. https)?
        # for now, assume it's the same as url, or https if not specified
        if "//" in domain:
            protocol = domain[:domain.index("//") + 2]
        else:
            debug("Assuming HTTPS", verbosity, platform)
            protocol = "https://"
        domain = protocol + remote_host
        debug("Translating " + url + " into " + domain + "/" + page_type + "/" + name, verbosity, platform)

    # get API page
    api_page_url = domain + "/api/v1/" + page_type + "/" + name
    api_page = download(platform, api_page_url, args, verbosity)
    if api_page is None:
        return None

    if page_type == "videos":
        # TODO: This doesn't need two API calls if we just parse the JSON
        # query API for video-channels page

        # search from end, not from beginning, as "/video-channels/" is significant part
        channel_domain = search(api_page, '"url":"', '/video-channels/', reverse=True)
        if channel_domain is None:
            return None
        channel_name = search(api_page, '"url":"' + channel_domain + '/video-channels/', '"')
        if channel_name is None:
            return None

        # from here on the lookup continues as a video-channels request
        page_type = "video-channels"
        domain = channel_domain
        name = channel_name
        api_page_url = domain + "/api/v1/" + page_type + "/" + name
        api_page = download(platform, api_page_url, args, verbosity)
        if api_page is None:
            return None

    # first "id" value in the API response; concatenated as text below
    ident = search(api_page, '"id":', ",")
    if ident is None:
        return None

    if page_type == "accounts":
        return domain + "/feeds/videos.xml?accountId=" + ident
    elif page_type == "video-channels":
        return domain + "/feeds/videos.xml?videoChannelId=" + ident
    return None
|
||
|
|
||
|
def extract(url, page=None, network=False, verbosity=3, args=None):
    """Return the PeerTube feed URL for ``url``, or ``None``.

    Args:
        url: Page address; it supplies the domain and the item to look up.
        page: Passed through to ``extract_from_page``.
        network: Must be true for any lookup to happen, because the feed
            cannot be derived from the URL alone.
        verbosity: Passed through to ``extract_from_page``.
        args: Passed through to ``extract_from_page``; when omitted, a fresh
            empty dict is used for each call.

    Returns:
        Whatever ``extract_from_page`` returns when ``network`` is true,
        otherwise ``None``.
    """
    # cannot get feed from URL alone
    if not network:
        return None

    # build the default per call so no dict is shared between calls
    if args is None:
        args = {}

    # note: we need the URL for the domain
    return extract_from_page(page, verbosity, url, args)
|