[youtube] Cleanup authentication code (#786)

Authored by: coletdjnz
This commit is contained in:
coletdjnz 2021-09-24 00:52:17 +00:00 committed by GitHub
parent 51ff9ca0b0
commit 99e9e001de
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 70 deletions

View file

@ -183,7 +183,7 @@ def download_and_parse_fragment(url, frag_index, request_data=None, headers=None
request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 0))} request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 0))}
if click_tracking_params: if click_tracking_params:
request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params}
headers = ie.generate_api_headers(ytcfg, visitor_data=visitor_data) headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
headers.update({'content-type': 'application/json'}) headers.update({'content-type': 'application/json'})
fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n' fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n'
success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( success, continuation_id, offset, click_tracking_params = download_and_parse_fragment(

View file

@ -508,13 +508,6 @@ def _extract_client_name(self, ytcfg, default_client='web'):
ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client) lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client)
@staticmethod
def _extract_session_index(*data):
for ytcfg in data:
session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
if session_index is not None:
return session_index
def _extract_client_version(self, ytcfg, default_client='web'): def _extract_client_version(self, ytcfg, default_client='web'):
return self._ytcfg_get_safe( return self._ytcfg_get_safe(
ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
@ -593,17 +586,27 @@ def extract_yt_initial_data(self, video_id, webpage):
self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
video_id) video_id)
def _extract_identity_token(self, webpage, item_id): @staticmethod
if not webpage: def _extract_session_index(*data):
return None """
ytcfg = self.extract_ytcfg(item_id, webpage) Index of current account in account list.
See: https://github.com/yt-dlp/yt-dlp/pull/519
"""
for ytcfg in data:
session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
if session_index is not None:
return session_index
# Deprecated?
def _extract_identity_token(self, ytcfg=None, webpage=None):
if ytcfg: if ytcfg:
token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
if token: if token:
return token return token
if webpage:
return self._search_regex( return self._search_regex(
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None) 'identity token', default=None, fatal=False)
@staticmethod @staticmethod
def _extract_account_syncid(*args): def _extract_account_syncid(*args):
@ -624,6 +627,10 @@ def _extract_account_syncid(*args):
# and just "user_syncid||" for primary channel. We only want the channel_syncid # and just "user_syncid||" for primary channel. We only want the channel_syncid
return sync_ids[0] return sync_ids[0]
@property
def is_authenticated(self):
return bool(self._generate_sapisidhash_header())
def extract_ytcfg(self, video_id, webpage): def extract_ytcfg(self, video_id, webpage):
if not webpage: if not webpage:
return {} return {}
@ -633,33 +640,30 @@ def extract_ytcfg(self, video_id, webpage):
default='{}'), video_id, fatal=False) or {} default='{}'), video_id, fatal=False) or {}
def generate_api_headers( def generate_api_headers(
self, ytcfg=None, identity_token=None, account_syncid=None, self, *, ytcfg=None, account_syncid=None, session_index=None,
visitor_data=None, api_hostname=None, default_client='web', session_index=None): visitor_data=None, identity_token=None, api_hostname=None, default_client='web'):
origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client))
headers = { headers = {
'X-YouTube-Client-Name': compat_str( 'X-YouTube-Client-Name': compat_str(
self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)),
'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client),
'Origin': origin 'Origin': origin,
} 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg),
if not visitor_data and ytcfg: 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg),
visitor_data = try_get( 'X-Goog-Visitor-Id': visitor_data or try_get(
self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str) self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str)
if identity_token: }
headers['X-Youtube-Identity-Token'] = identity_token if session_index is None:
if account_syncid:
headers['X-Goog-PageId'] = account_syncid
if session_index is None and ytcfg:
session_index = self._extract_session_index(ytcfg) session_index = self._extract_session_index(ytcfg)
if account_syncid or session_index is not None: if account_syncid or session_index is not None:
headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0
if visitor_data:
headers['X-Goog-Visitor-Id'] = visitor_data
auth = self._generate_sapisidhash_header(origin) auth = self._generate_sapisidhash_header(origin)
if auth is not None: if auth is not None:
headers['Authorization'] = auth headers['Authorization'] = auth
headers['X-Origin'] = origin headers['X-Origin'] = origin
return headers return {h: v for h, v in headers.items() if v is not None}
@staticmethod @staticmethod
def _build_api_continuation_query(continuation, ctp=None): def _build_api_continuation_query(continuation, ctp=None):
@ -2224,8 +2228,7 @@ def _extract_comment(self, comment_renderer, parent=None):
'parent': parent or 'root' 'parent': parent or 'root'
} }
def _comment_entries(self, root_continuation_data, identity_token, account_syncid, def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
ytcfg, video_id, parent=None, comment_counts=None):
def extract_header(contents): def extract_header(contents):
_total_comments = 0 _total_comments = 0
@ -2283,8 +2286,8 @@ def extract_thread(contents):
if comment_replies_renderer: if comment_replies_renderer:
comment_counts[2] += 1 comment_counts[2] += 1
comment_entries_iter = self._comment_entries( comment_entries_iter = self._comment_entries(
comment_replies_renderer, identity_token, account_syncid, ytcfg, comment_replies_renderer, ytcfg, video_id,
video_id, parent=comment.get('id'), comment_counts=comment_counts) parent=comment.get('id'), comment_counts=comment_counts)
for reply_comment in comment_entries_iter: for reply_comment in comment_entries_iter:
yield reply_comment yield reply_comment
@ -2309,7 +2312,7 @@ def extract_thread(contents):
for page_num in itertools.count(0): for page_num in itertools.count(0):
if not continuation: if not continuation:
break break
headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data) headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
if page_num == 0: if page_num == 0:
if is_first_continuation: if is_first_continuation:
@ -2409,18 +2412,10 @@ def _generate_comment_continuation(video_id):
def _extract_comments(self, ytcfg, video_id, contents, webpage): def _extract_comments(self, ytcfg, video_id, contents, webpage):
"""Entry for comment extraction""" """Entry for comment extraction"""
def _real_comment_extract(contents): def _real_comment_extract(contents):
if isinstance(contents, list):
for entry in contents:
for key, renderer in entry.items():
if key not in known_entry_comment_renderers:
continue
yield from self._comment_entries( yield from self._comment_entries(
renderer, video_id=video_id, ytcfg=ytcfg, traverse_obj(contents, (..., 'itemSectionRenderer'), get_all=False), ytcfg, video_id)
identity_token=self._extract_identity_token(webpage, item_id=video_id),
account_syncid=self._extract_account_syncid(ytcfg))
break
comments = [] comments = []
known_entry_comment_renderers = ('itemSectionRenderer',)
estimated_total = 0 estimated_total = 0
max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf') max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) or float('inf')
# Force English regardless of account setting to prevent parsing issues # Force English regardless of account setting to prevent parsing issues
@ -2445,7 +2440,11 @@ def _real_comment_extract(contents):
} }
@staticmethod @staticmethod
def _generate_player_context(sts=None): def _get_checkok_params():
return {'contentCheckOk': True, 'racyCheckOk': True}
@classmethod
def _generate_player_context(cls, sts=None):
context = { context = {
'html5Preference': 'HTML5_PREF_WANTS', 'html5Preference': 'HTML5_PREF_WANTS',
} }
@ -2455,8 +2454,7 @@ def _generate_player_context(sts=None):
'playbackContext': { 'playbackContext': {
'contentPlaybackContext': context 'contentPlaybackContext': context
}, },
'contentCheckOk': True, **cls._get_checkok_params()
'racyCheckOk': True
} }
@staticmethod @staticmethod
@ -2475,14 +2473,13 @@ def _is_agegated(player_response):
def _is_unplayable(player_response): def _is_unplayable(player_response):
return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr): def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr):
session_index = self._extract_session_index(player_ytcfg, master_ytcfg) session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
headers = self.generate_api_headers( headers = self.generate_api_headers(
player_ytcfg, identity_token, syncid, ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
default_client=client, session_index=session_index)
yt_query = {'videoId': video_id} yt_query = {'videoId': video_id}
yt_query.update(self._generate_player_context(sts)) yt_query.update(self._generate_player_context(sts))
@ -2524,7 +2521,7 @@ def _extract_player_ytcfg(self, client, video_id):
webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config') webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config')
return self.extract_ytcfg(video_id, webpage) or {} return self.extract_ytcfg(video_id, webpage) or {}
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, identity_token): def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg):
initial_pr = None initial_pr = None
if webpage: if webpage:
initial_pr = self._extract_yt_initial_variable( initial_pr = self._extract_yt_initial_variable(
@ -2569,7 +2566,7 @@ def append_client(client_name):
try: try:
pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url if require_js_player else None, initial_pr) client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr)
except ExtractorError as e: except ExtractorError as e:
if last_error: if last_error:
self.report_warning(last_error) self.report_warning(last_error)
@ -2580,7 +2577,7 @@ def append_client(client_name):
prs.append(pr) prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header(): if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
append_client(client.replace('_agegate', '_creator')) append_client(client.replace('_agegate', '_creator'))
elif self._is_agegated(pr): elif self._is_agegated(pr):
append_client(f'{client}_agegate') append_client(f'{client}_agegate')
@ -2742,11 +2739,10 @@ def _real_extract(self, url):
webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
identity_token = self._extract_identity_token(webpage, video_id)
player_responses, player_url = self._extract_player_responses( player_responses, player_url = self._extract_player_responses(
self._get_requested_clients(url, smuggled_data), self._get_requested_clients(url, smuggled_data),
video_id, webpage, master_ytcfg, identity_token) video_id, webpage, master_ytcfg)
get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
@ -3059,13 +3055,12 @@ def process_language(container, base_url, lang_code, sub_name, query):
webpage, self._YT_INITIAL_DATA_RE, video_id, webpage, self._YT_INITIAL_DATA_RE, video_id,
'yt initial data') 'yt initial data')
if not initial_data: if not initial_data:
headers = self.generate_api_headers( query = {'videoId': video_id}
master_ytcfg, identity_token, self._extract_account_syncid(master_ytcfg), query.update(self._get_checkok_params())
session_index=self._extract_session_index(master_ytcfg))
initial_data = self._extract_response( initial_data = self._extract_response(
item_id=video_id, ep='next', fatal=False, item_id=video_id, ep='next', fatal=False,
ytcfg=master_ytcfg, headers=headers, query={'videoId': video_id}, ytcfg=master_ytcfg, query=query,
headers=self.generate_api_headers(ytcfg=master_ytcfg),
note='Downloading initial data API JSON') note='Downloading initial data API JSON')
try: try:
@ -3837,7 +3832,7 @@ def _rich_grid_entries(self, contents):
if entry: if entry:
yield entry yield entry
''' '''
def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg): def _entries(self, tab, item_id, account_syncid, ytcfg):
def extract_entries(parent_renderer): # this needs to be called again for continuation to work with feeds def extract_entries(parent_renderer): # this needs to be called again for continuation to work with feeds
contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@ -3894,7 +3889,8 @@ def extract_entries(parent_renderer): # this needs to called again for continua
for page_num in itertools.count(1): for page_num in itertools.count(1):
if not continuation: if not continuation:
break break
headers = self.generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data) headers = self.generate_api_headers(
ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
response = self._extract_response( response = self._extract_response(
item_id='%s page %s' % (item_id, page_num), item_id='%s page %s' % (item_id, page_num),
query=continuation, headers=headers, ytcfg=ytcfg, query=continuation, headers=headers, ytcfg=ytcfg,
@ -4048,7 +4044,6 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs):
return self.playlist_result( return self.playlist_result(
self._entries( self._entries(
selected_tab, playlist_id, selected_tab, playlist_id,
self._extract_identity_token(webpage, item_id),
self._extract_account_syncid(ytcfg, data), ytcfg), self._extract_account_syncid(ytcfg, data), ytcfg),
**metadata) **metadata)
@ -4056,8 +4051,7 @@ def _extract_mix_playlist(self, playlist, playlist_id, data, webpage):
first_id = last_id = None first_id = last_id = None
ytcfg = self.extract_ytcfg(playlist_id, webpage) ytcfg = self.extract_ytcfg(playlist_id, webpage)
headers = self.generate_api_headers( headers = self.generate_api_headers(
ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data))
identity_token=self._extract_identity_token(webpage, item_id=playlist_id))
for page_num in itertools.count(1): for page_num in itertools.count(1):
videos = list(self._playlist_entries(playlist)) videos = list(self._playlist_entries(playlist))
if not videos: if not videos:
@ -4173,10 +4167,8 @@ def _reload_with_unavailable_videos(self, item_id, data, webpage):
ytcfg = self.extract_ytcfg(item_id, webpage) ytcfg = self.extract_ytcfg(item_id, webpage)
headers = self.generate_api_headers( headers = self.generate_api_headers(
ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
identity_token=self._extract_identity_token(webpage, item_id=item_id), visitor_data=try_get(self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
visitor_data=try_get(
self._extract_context(ytcfg), lambda x: x['client']['visitorData'], compat_str))
query = { query = {
'params': params or 'wgYCCAA=', 'params': params or 'wgYCCAA=',
'browseId': browse_id or 'VL%s' % item_id 'browseId': browse_id or 'VL%s' % item_id