mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-23 01:06:41 +00:00
[YouTube] Use new browse API for continuation page extraction. (#131)
Known issues (these issues existed in the previous API as well):
* Mix playlists only give 1 page (25 vids)
* Trending only gives 1 video
* History gives 5 pages (200 vids)

Co-authored-by: colethedj, pukkandan
This commit is contained in:
parent
c705177da2
commit
a5c5623470
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
import os.path
|
import os.path
|
||||||
|
@ -274,7 +275,7 @@ def _real_initialize(self):
|
||||||
'context': {
|
'context': {
|
||||||
'client': {
|
'client': {
|
||||||
'clientName': 'WEB',
|
'clientName': 'WEB',
|
||||||
'clientVersion': '2.20201021.03.00',
|
'clientVersion': '2.20210301.08.00',
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -283,15 +284,28 @@ def _real_initialize(self):
|
||||||
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
|
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
|
||||||
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
|
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
|
||||||
|
|
||||||
def _call_api(self, ep, query, video_id, fatal=True):
|
def _generate_sapisidhash_header(self):
|
||||||
|
sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
|
||||||
|
if sapisid_cookie is None:
|
||||||
|
return
|
||||||
|
time_now = round(time.time())
|
||||||
|
sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
|
||||||
|
return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
|
||||||
|
|
||||||
|
def _call_api(self, ep, query, video_id, fatal=True, headers=None,
|
||||||
|
note='Downloading API JSON', errnote='Unable to download API page'):
|
||||||
data = self._DEFAULT_API_DATA.copy()
|
data = self._DEFAULT_API_DATA.copy()
|
||||||
data.update(query)
|
data.update(query)
|
||||||
|
headers = headers or {}
|
||||||
|
headers.update({'content-type': 'application/json'})
|
||||||
|
auth = self._generate_sapisidhash_header()
|
||||||
|
if auth is not None:
|
||||||
|
headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
|
||||||
|
|
||||||
return self._download_json(
|
return self._download_json(
|
||||||
'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
|
'https://www.youtube.com/youtubei/v1/%s' % ep,
|
||||||
note='Downloading API JSON', errnote='Unable to download API page',
|
video_id=video_id, fatal=fatal, note=note, errnote=errnote,
|
||||||
data=json.dumps(data).encode('utf8'), fatal=fatal,
|
data=json.dumps(data).encode('utf8'), headers=headers,
|
||||||
headers={'content-type': 'application/json'},
|
|
||||||
query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
|
query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
|
||||||
|
|
||||||
def _extract_yt_initial_data(self, video_id, webpage):
|
def _extract_yt_initial_data(self, video_id, webpage):
|
||||||
|
@ -2699,7 +2713,7 @@ def _extract_continuation(cls, renderer):
|
||||||
ctp = continuation_ep.get('clickTrackingParams')
|
ctp = continuation_ep.get('clickTrackingParams')
|
||||||
return YoutubeTabIE._build_continuation_query(continuation, ctp)
|
return YoutubeTabIE._build_continuation_query(continuation, ctp)
|
||||||
|
|
||||||
def _entries(self, tab, identity_token):
|
def _entries(self, tab, identity_token, item_id):
|
||||||
|
|
||||||
def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
|
def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds
|
||||||
contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
|
contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
|
||||||
|
@ -2770,11 +2784,14 @@ def extract_entries(parent_renderer): # this needs to called again for continua
|
||||||
if last_error:
|
if last_error:
|
||||||
self.report_warning('%s. Retrying ...' % last_error)
|
self.report_warning('%s. Retrying ...' % last_error)
|
||||||
try:
|
try:
|
||||||
browse = self._download_json(
|
response = self._call_api(
|
||||||
'https://www.youtube.com/browse_ajax', None,
|
ep="browse", fatal=True, headers=headers,
|
||||||
'Downloading page %d%s'
|
video_id='%s page %s' % (item_id, page_num),
|
||||||
% (page_num, ' (retry #%d)' % count if count else ''),
|
query={
|
||||||
headers=headers, query=continuation)
|
'continuation': continuation['continuation'],
|
||||||
|
'clickTracking': {'clickTrackingParams': continuation['itct']},
|
||||||
|
},
|
||||||
|
note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
|
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
|
||||||
# Downloading page may result in intermittent 5xx HTTP error
|
# Downloading page may result in intermittent 5xx HTTP error
|
||||||
|
@ -2784,8 +2801,6 @@ def extract_entries(parent_renderer): # this needs to called again for continua
|
||||||
continue
|
continue
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
response = try_get(browse, lambda x: x[1]['response'], dict)
|
|
||||||
|
|
||||||
# Youtube sometimes sends incomplete data
|
# Youtube sometimes sends incomplete data
|
||||||
# See: https://github.com/ytdl-org/youtube-dl/issues/28194
|
# See: https://github.com/ytdl-org/youtube-dl/issues/28194
|
||||||
if response.get('continuationContents') or response.get('onResponseReceivedActions'):
|
if response.get('continuationContents') or response.get('onResponseReceivedActions'):
|
||||||
|
@ -2793,7 +2808,8 @@ def extract_entries(parent_renderer): # this needs to called again for continua
|
||||||
last_error = 'Incomplete data recieved'
|
last_error = 'Incomplete data recieved'
|
||||||
if count >= retries:
|
if count >= retries:
|
||||||
self._downloader.report_error(last_error)
|
self._downloader.report_error(last_error)
|
||||||
if not browse or not response:
|
|
||||||
|
if not response:
|
||||||
break
|
break
|
||||||
|
|
||||||
known_continuation_renderers = {
|
known_continuation_renderers = {
|
||||||
|
@ -2936,7 +2952,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
|
||||||
'channel_id': metadata['uploader_id'],
|
'channel_id': metadata['uploader_id'],
|
||||||
'channel_url': metadata['uploader_url']})
|
'channel_url': metadata['uploader_url']})
|
||||||
return self.playlist_result(
|
return self.playlist_result(
|
||||||
self._entries(selected_tab, identity_token),
|
self._entries(selected_tab, identity_token, playlist_id),
|
||||||
**metadata)
|
**metadata)
|
||||||
|
|
||||||
def _extract_from_playlist(self, item_id, url, data, playlist):
|
def _extract_from_playlist(self, item_id, url, data, playlist):
|
||||||
|
@ -3223,26 +3239,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
|
||||||
_TESTS = []
|
_TESTS = []
|
||||||
|
|
||||||
def _entries(self, query, n):
|
def _entries(self, query, n):
|
||||||
data = {
|
data = {'query': query}
|
||||||
'context': {
|
|
||||||
'client': {
|
|
||||||
'clientName': 'WEB',
|
|
||||||
'clientVersion': '2.20201021.03.00',
|
|
||||||
}
|
|
||||||
},
|
|
||||||
'query': query,
|
|
||||||
}
|
|
||||||
if self._SEARCH_PARAMS:
|
if self._SEARCH_PARAMS:
|
||||||
data['params'] = self._SEARCH_PARAMS
|
data['params'] = self._SEARCH_PARAMS
|
||||||
total = 0
|
total = 0
|
||||||
for page_num in itertools.count(1):
|
for page_num in itertools.count(1):
|
||||||
search = self._download_json(
|
search = self._call_api(
|
||||||
'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
|
ep='search', video_id='query "%s"' % query, fatal=False,
|
||||||
video_id='query "%s"' % query,
|
note='Downloading page %s' % page_num, query=data)
|
||||||
note='Downloading page %s' % page_num,
|
|
||||||
errnote='Unable to download API page', fatal=False,
|
|
||||||
data=json.dumps(data).encode('utf8'),
|
|
||||||
headers={'content-type': 'application/json'})
|
|
||||||
if not search:
|
if not search:
|
||||||
break
|
break
|
||||||
slr_contents = try_get(
|
slr_contents = try_get(
|
||||||
|
@ -3394,8 +3398,8 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
|
||||||
|
|
||||||
|
|
||||||
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
|
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
|
||||||
IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
|
IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
|
||||||
_VALID_URL = r':ythistory'
|
_VALID_URL = r':ythis(?:tory)?'
|
||||||
_FEED_NAME = 'history'
|
_FEED_NAME = 'history'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': ':ythistory',
|
'url': ':ythistory',
|
||||||
|
|
Loading…
Reference in a new issue