From d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:47:13 -0500 Subject: [PATCH] [extractor/twitter] Add login support (#7258) Closes #6951 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 213 +++++++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4624ce503..f854d9c4a 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE -from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -30,11 +29,67 @@ class TwitterBaseIE(InfoExtractor): + _NETRC_MACHINE = 'twitter' _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} _guest_token = None + _flow_token = None + + _LOGIN_INIT_DATA = json.dumps({ + 'input_flow_data': { + 'flow_context': { + 'debug_overrides': {}, + 'start_location': { + 'location': 'unknown' + } + } + }, + 'subtask_versions': { + 'action_list': 2, + 'alert_dialog': 1, + 'app_download_cta': 1, + 'check_logged_in_account': 1, + 'choice_selection': 3, + 'contacts_live_sync_permission_prompt': 0, + 'cta': 7, + 'email_verification': 2, + 'end_flow': 1, + 'enter_date': 1, + 'enter_email': 2, + 'enter_password': 5, + 'enter_phone': 2, + 'enter_recaptcha': 1, + 'enter_text': 5, + 'enter_username': 2, + 'generic_urt': 3, + 'in_app_notification': 1, + 'interest_picker': 3, + 'js_instrumentation': 1, + 'menu_dialog': 1, + 'notifications_permission_prompt': 2, + 'open_account': 2, + 'open_home_timeline': 1, + 'open_link': 1, + 'phone_verification': 4, + 'privacy_options': 1, + 'security_key': 3, + 'select_avatar': 4, + 'select_banner': 2, + 'settings_list': 7, + 'show_code': 1, + 'sign_up': 2, + 'sign_up_review': 4, + 'tweet_selection_urt': 1, + 'update_users': 1, + 'upload_media': 1, + 'user_recommendations_list': 4, + 'user_recommendations_urt': 1, + 'wait_spinner': 3, + 'web_modal': 1 + } + }, separators=(',', ':')).encode() def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -86,18 +141,151 @@ def _search_dimensions_in_video_url(a_format, video_url): 'height': int(m.group('height')), }) - @functools.cached_property + @property def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _call_api(self, path, video_id, query={}, graphql=False): - cookies = self._get_cookies(self._API_BASE) + def _fetch_guest_token(self, headers, display_id): + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if not self._guest_token: + raise ExtractorError('Could not retrieve guest token') + + def _set_base_headers(self): headers = self._AUTH.copy() + csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) + if csrf_token: + headers['x-csrf-token'] = csrf_token + return headers - csrf_cookie = cookies.get('ct0') - if csrf_cookie: - headers['x-csrf-token'] = csrf_cookie.value + def _call_login_api(self, note, headers, query={}, data=None): + response = self._download_json( + f'{self._API_BASE}onboarding/task.json', None, note, + headers=headers, query=query, data=data, expected_status=400) + error = traverse_obj(response, ('errors', 0, 'message', {str})) + if error: + raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True) + elif traverse_obj(response, 'status') != 'success': + raise ExtractorError('Login was unsuccessful') + subtask = traverse_obj( + response, ('subtasks', ..., 'subtask_id', {str}), get_all=False) + if not subtask: + raise ExtractorError('Twitter API did not return next login subtask') + + self._flow_token = response['flow_token'] + + return subtask + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage('https://twitter.com/', None, 'Requesting cookies') + headers = self._set_base_headers() + self._fetch_guest_token(headers, None) + headers.update({ + 'content-type': 'application/json', + 'x-guest-token': self._guest_token, + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + 'Referer': 'https://twitter.com/', + 'Origin': 'https://twitter.com', + }) + + def build_login_json(*subtask_inputs): + return json.dumps({ + 'flow_token': self._flow_token, + 'subtask_inputs': subtask_inputs + }, separators=(',', ':')).encode() + + def input_dict(subtask_id, text): + return { + 'subtask_id': subtask_id, + 'enter_text': { + 'text': text, + 'link': 'next_link' + } + } + + next_subtask = self._call_login_api( + 'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA) + + while not self.is_logged_in: + if next_subtask == 'LoginJsInstrumentationSubtask': + next_subtask = self._call_login_api( + 'Submitting JS instrumentation response', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'js_instrumentation': { + 'response': '{}', + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterUserIdentifierSSO': + next_subtask = self._call_login_api( + 'Submitting username', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'settings_list': { + 'setting_responses': [{ + 'key': 'user_identifier', + 'response_data': { + 'text_data': { + 'result': username + } + } + }], + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterAlternateIdentifierSubtask': + next_subtask = self._call_login_api( + 'Submitting alternate identifier', headers, + data=build_login_json(input_dict(next_subtask, self._get_tfa_info( + 'one of username, phone number or email that was not used as --username')))) + + elif next_subtask == 'LoginEnterPassword': + next_subtask = self._call_login_api( + 'Submitting password', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'enter_password': { + 'password': password, + 'link': 'next_link' + } + })) + + elif next_subtask == 'AccountDuplicationCheck': + next_subtask = self._call_login_api( + 'Submitting account duplication check', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'check_logged_in_account': { + 'link': 'AccountDuplicationCheck_false' + } + })) + + elif next_subtask == 'LoginTwoFactorAuthChallenge': + next_subtask = self._call_login_api( + 'Submitting 2FA token', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('two-factor authentication token')))) + + elif next_subtask == 'LoginAcid': + next_subtask = self._call_login_api( + 'Submitting confirmation code', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + + elif next_subtask == 'LoginSuccessSubtask': + raise ExtractorError('Twitter API did not grant auth token cookie') + + else: + raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"') + + self.report_login() + + def _call_api(self, path, video_id, query={}, graphql=False): + headers = self._set_base_headers() if self.is_logged_in: headers.update({ 'x-twitter-auth-type': 'OAuth2Session', @@ -106,15 +294,10 @@ def _call_api(self, path, video_id, query={}, graphql=False): }) for first_attempt in (True, False): - if not self.is_logged_in and not self._guest_token: - headers.pop('x-guest-token', None) - self._guest_token = traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', video_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if self._guest_token: + if not self.is_logged_in: + if not self._guest_token: + self._fetch_guest_token(headers, video_id) headers['x-guest-token'] = self._guest_token - elif not self.is_logged_in: - raise ExtractorError('Could not retrieve guest token') allowed_status = {400, 401, 403, 404} if graphql else {403} result = self._download_json(