From 3b52a606881e6adadc33444abdeacce562b79330 Mon Sep 17 00:00:00 2001
From: ringus1
Date: Tue, 9 May 2023 01:19:42 +0200
Subject: [PATCH 01/75] [extractor/facebook] Fix metadata extraction (#6856)

Closes #3432

Authored by: ringus1
---
 yt_dlp/extractor/facebook.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 1404be612e..9d871eb286 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -390,7 +390,10 @@ def extract_metadata(webpage):
                 k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
             title = get_first(media, ('title', 'text'))
             description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
-            uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
+            uploader_data = (
+                get_first(media, ('owner', {dict}))
+                or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+                or get_first(post, ('node', 'actors', ..., {dict})) or {})
 
             page_title = title or self._html_search_regex((
                 r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
@@ -415,16 +418,17 @@ def extract_metadata(webpage):
             # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
             if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
                 thumbnail = None
-            view_count = parse_count(self._search_regex(
-                r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
-                default=None))
             info_dict = {
                 'description': description,
                 'uploader': uploader,
                 'uploader_id': uploader_data.get('id'),
                 'timestamp': timestamp,
                 'thumbnail': thumbnail,
-                'view_count': view_count,
+                'view_count': parse_count(self._search_regex(
+                    (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
+                    webpage, 'view count', default=None)),
+                'concurrent_view_count': get_first(post, (
+                    ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
             }
 
             info_json_ld = self._search_json_ld(webpage, video_id, default={})

From ef8fb7f029b816dfc95600727d84400591a3b5c5 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Mon, 8 May 2023 18:45:31 -0500
Subject: [PATCH 02/75] [extractor/wrestleuniverse] Fix extraction, add login
 (#6982)

Closes #6975

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki
---
 README.md                           |   3 +
 yt_dlp/extractor/wrestleuniverse.py | 137 +++++++++++++++++++++-------
 2 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index c1f34235db..993ac5a5f6 100644
--- a/README.md
+++ b/README.md
@@ -1835,6 +1835,9 @@ #### rokfinchannel
 #### twitter
 * `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
 
+#### wrestleuniverse
+* `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage
+
 **Note**: These options may be changed/removed in the future without concern for backward compatibility
diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py
index 5c6dec2c40..946edf20a4 100644
--- a/yt_dlp/extractor/wrestleuniverse.py
+++ b/yt_dlp/extractor/wrestleuniverse.py
@@ -2,6 +2,7 @@
 import binascii
 import json
 import time
+import uuid
 
 from .common import InfoExtractor
 from ..dependencies import Cryptodome
@@ -12,30 +13,95 @@
     traverse_obj,
     try_call,
     url_or_none,
+    urlencode_postdata,
 )
 
 
 class WrestleUniverseBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'wrestleuniverse'
     _VALID_URL_TMPL = r'https?://(?:www\.)?wrestle-universe\.com/(?:(?P<lang>\w{2})/)?%s/(?P<id>\w+)'
     _API_PATH = None
-    _TOKEN = None
+    _REAL_TOKEN = None
     _TOKEN_EXPIRY = None
+    _REFRESH_TOKEN = None
+    _DEVICE_ID = None
+    _LOGIN_QUERY = {'key': 'AIzaSyCaRPBsDQYVDUWWBXjsTrHESi2r_F3RAdA'}
+    _LOGIN_HEADERS = {
+        'Accept': '*/*',
+        'Content-Type': 'application/json',
+        'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web',
+        'X-Firebase-gmpid': '1:307308870738:web:820f38fe5150c8976e338b',
+        'Referer': 'https://www.wrestle-universe.com/',
+        'Origin': 'https://www.wrestle-universe.com',
+    }
 
-    def _get_token_cookie(self):
-        if not self._TOKEN or not self._TOKEN_EXPIRY:
-            self._TOKEN = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value)
-            if not self._TOKEN:
+    @property
+    def _TOKEN(self):
+        if not self._REAL_TOKEN or not self._TOKEN_EXPIRY:
+            token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value)
+            if not token and not self._REFRESH_TOKEN:
                 self.raise_login_required()
-            expiry = traverse_obj(jwt_decode_hs256(self._TOKEN), ('exp', {int_or_none}))
-            if not expiry:
-                raise ExtractorError('There was a problem with the token cookie')
-            self._TOKEN_EXPIRY = expiry
+            self._REAL_TOKEN = token
 
-        if self._TOKEN_EXPIRY <= int(time.time()):
-            raise ExtractorError(
-                'Expired token. Refresh your cookies in browser and try again', expected=True)
+        if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()):
+            if not self._REFRESH_TOKEN:
+                raise ExtractorError(
+                    'Expired token.
Refresh your cookies in browser and try again', expected=True) + self._refresh_token() - return self._TOKEN + return self._REAL_TOKEN + + @_TOKEN.setter + def _TOKEN(self, value): + self._REAL_TOKEN = value + + expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int_or_none})) + if not expiry: + raise ExtractorError('There was a problem with the auth token') + self._TOKEN_EXPIRY = expiry + + def _perform_login(self, username, password): + login = self._download_json( + 'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword', None, + 'Logging in', query=self._LOGIN_QUERY, headers=self._LOGIN_HEADERS, data=json.dumps({ + 'returnSecureToken': True, + 'email': username, + 'password': password, + }, separators=(',', ':')).encode()) + self._REFRESH_TOKEN = traverse_obj(login, ('refreshToken', {str})) + if not self._REFRESH_TOKEN: + self.report_warning('No refresh token was granted') + self._TOKEN = traverse_obj(login, ('idToken', {str})) + + def _real_initialize(self): + if WrestleUniverseBaseIE._DEVICE_ID: + return + + WrestleUniverseBaseIE._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key='WrestleUniverse')[0] + if not WrestleUniverseBaseIE._DEVICE_ID: + WrestleUniverseBaseIE._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id') + if WrestleUniverseBaseIE._DEVICE_ID: + return + WrestleUniverseBaseIE._DEVICE_ID = str(uuid.uuid4()) + + self.cache.store(self._NETRC_MACHINE, 'device_id', WrestleUniverseBaseIE._DEVICE_ID) + + def _refresh_token(self): + refresh = self._download_json( + 'https://securetoken.googleapis.com/v1/token', None, 'Refreshing token', + query=self._LOGIN_QUERY, data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': self._REFRESH_TOKEN, + }), headers={ + **self._LOGIN_HEADERS, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + if traverse_obj(refresh, ('refresh_token', {str})): + self._REFRESH_TOKEN = refresh['refresh_token'] + token = traverse_obj(refresh, 'access_token', 'id_token', expected_type=str) + if not token: + raise ExtractorError('No auth token returned from refresh request') + self._TOKEN = token def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={}, fatal=True): headers = {'CA-CID': ''} @@ -43,7 +109,7 @@ def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={ headers['Content-Type'] = 'application/json;charset=utf-8' data = json.dumps(data, separators=(',', ':')).encode() if auth: - headers['Authorization'] = f'Bearer {self._get_token_cookie()}' + headers['Authorization'] = f'Bearer {self._TOKEN}' return self._download_json( f'https://api.wrestle-universe.com/v1/{self._API_PATH}/{video_id}{param}', video_id, note=f'Downloading {msg} JSON', errnote=f'Failed to download {msg} JSON', @@ -65,7 +131,7 @@ def decrypt(data): token = base64.b64encode(private_key.public_key().export_key('DER')).decode() api_json = self._call_api(video_id, param, msg, data={ - # 'deviceId' (random uuid4 generated at login) is not required yet + 'deviceId': self._DEVICE_ID, 'token': token, **data, }, query=query, fatal=fatal) @@ -105,7 +171,7 @@ class WrestleUniverseVODIE(WrestleUniverseBaseIE): 'upload_date': '20230129', 'thumbnail': 'https://image.asset.wrestle-universe.com/8FjD67P8rZc446RBQs5RBN/8FjD67P8rZc446RBQs5RBN', 'chapters': 'count:7', - 'cast': 'count:18', + 'cast': 'count:21', }, 'params': { 'skip_download': 'm3u8', @@ -169,6 +235,7 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'No longer 
available', }, { 'note': 'unencrypted HLS', 'url': 'https://www.wrestle-universe.com/en/lives/wUG8hP5iApC63jbtQzhVVx', @@ -196,14 +263,17 @@ def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') metadata = self._download_metadata(url, video_id, lang, 'eventFallbackData') - info = traverse_obj(metadata, { - 'title': ('displayName', {str}), - 'description': ('description', {str}), - 'channel': ('labels', 'group', {str}), - 'location': ('labels', 'venue', {str}), - 'timestamp': ('startTime', {int_or_none}), - 'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}), - }) + info = { + 'id': video_id, + **traverse_obj(metadata, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'channel': ('labels', 'group', {str}), + 'location': ('labels', 'venue', {str}), + 'timestamp': ('startTime', {int_or_none}), + 'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}), + }), + } ended_time = traverse_obj(metadata, ('endedTime', {int_or_none})) if info.get('timestamp') and ended_time: @@ -211,23 +281,20 @@ def _real_extract(self, url): video_data, decrypt = self._call_encrypted_api( video_id, ':watchArchive', 'watch archive', data={'method': 1}) - formats = self._get_formats(video_data, ( + info['formats'] = self._get_formats(video_data, ( ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id) - for f in formats: + for f in info['formats']: # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values if f.get('tbr'): f['tbr'] = int(f['tbr'] / 2.5) hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt})) - if not hls_aes_key and traverse_obj(video_data, ('hls', 'encryptType', {int}), default=0) > 0: - self.report_warning('HLS AES-128 key was not found in API response') - - return { - 'id': video_id, - 'formats': formats, - 'hls_aes': { + if hls_aes_key: + info['hls_aes'] = { 'key': hls_aes_key, 'iv': traverse_obj(video_data, ('hls', 'iv', {decrypt})), }, - **info, - } + elif traverse_obj(video_data, ('hls', 'encryptType', {int})): + self.report_warning('HLS AES-128 key was not found in API response') + + return info From 21b9413cf7dd4830b2ece57af21589dd4538fc52 Mon Sep 17 00:00:00 2001 From: toomyzoom <52140413+toomyzoom@users.noreply.github.com> Date: Thu, 11 May 2023 02:48:35 -0700 Subject: [PATCH 03/75] [extractor/iwara] Implement login (#6721) Authored by: toomyzoom --- yt_dlp/extractor/iwara.py | 88 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index a5aad26ee8..bdc39a7ddb 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,6 +1,7 @@ import functools import urllib.parse import hashlib +import json from .common import InfoExtractor from ..utils import ( @@ -14,7 +15,49 @@ ) -class IwaraIE(InfoExtractor): +# https://github.com/yt-dlp/yt-dlp/issues/6671 +class IwaraBaseIE(InfoExtractor): + _USERTOKEN = None + _MEDIATOKEN = None + _NETRC_MACHINE = 'iwara' + + def _get_user_token(self, invalidate=False): + if not invalidate and self._USERTOKEN: + return self._USERTOKEN + + username, password = self._get_login_info() + IwaraBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username) + if not IwaraBaseIE._USERTOKEN or invalidate: + IwaraBaseIE._USERTOKEN = self._download_json( + 'https://api.iwara.tv/user/login', None, note='Logging in', + 
data=json.dumps({
+                'email': username,
+                'password': password
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json'
+            })['token']
+
+        self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN)
+
+        return self._USERTOKEN
+
+    def _get_media_token(self, invalidate=False):
+        if not invalidate and self._MEDIATOKEN:
+            return self._MEDIATOKEN
+
+        IwaraBaseIE._MEDIATOKEN = self._download_json(
+            'https://api.iwara.tv/user/token', None, note='Fetching media token',
+            data=b'',  # Need to have some data here, even if it's empty
+            headers={
+                'Authorization': f'Bearer {self._get_user_token()}',
+                'Content-Type': 'application/json'
+            })['accessToken']
+
+        return self._MEDIATOKEN
+
+
+class IwaraIE(IwaraBaseIE):
     IE_NAME = 'iwara'
     _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)'
     _TESTS = [{
@@ -56,6 +99,26 @@ class IwaraIE(InfoExtractor):
             'timestamp': 1678732213,
             'modified_timestamp': 1679110271,
         },
+    }, {
+        'url': 'https://iwara.tv/video/blggmfno8ghl725bg',
+        'info_dict': {
+            'id': 'blggmfno8ghl725bg',
+            'ext': 'mp4',
+            'age_limit': 18,
+            'title': 'お外でおしっこしちゃう猫耳ロリメイド',
+            'description': 'md5:0342ba9bf6db09edbbb28729657c3611',
+            'uploader': 'Fe_Kurosabi',
+            'uploader_id': 'fekurosabi',
+            'tags': [
+                'pee'
+            ],
+            'like_count': 192,
+            'view_count': 12119,
+            'comment_count': 0,
+            'timestamp': 1598880567,
+            'modified_timestamp': 1598908995,
+            'availability': 'needs_auth',
+        },
     }]
 
     def _extract_formats(self, video_id, fileurl):
@@ -79,12 +142,18 @@ def _extract_formats(self, video_id, fileurl):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True)
+        username, password = self._get_login_info()
+        headers = {
+            'Authorization': f'Bearer {self._get_media_token()}',
+        } if username and password else None
+        video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True, headers=headers)
         errmsg = video_data.get('message')
         # at this point we can actually get uploaded user info, but do we need it?
         if errmsg == 'errors.privateVideo':
             self.raise_login_required('Private video. Login if you have permissions to watch')
-        elif errmsg:
+        elif errmsg == 'errors.notFound' and not username:
+            self.raise_login_required('Video may need login to view')
+        elif errmsg:  # None if success
             raise ExtractorError(f'Iwara says: {errmsg}')
 
         if not video_data.get('fileUrl'):
@@ -112,8 +181,17 @@ def _real_extract(self, url):
             'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))),
         }
 
+    def _perform_login(self, username, password):
+        if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
+            self.write_debug('Skipping logging in')
+            return
 
-class IwaraUserIE(InfoExtractor):
+        IwaraBaseIE._USERTOKEN = self._get_user_token(True)
+        self._get_media_token(True)
+        self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN)
+
+
+class IwaraUserIE(IwaraBaseIE):
     _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)'
     IE_NAME = 'iwara:user'
     _PER_PAGE = 32
@@ -165,7 +243,7 @@ def _real_extract(self, url):
                 playlist_id, traverse_obj(user_info, ('user', 'name')))
 
 
-class IwaraPlaylistIE(InfoExtractor):
+class IwaraPlaylistIE(IwaraBaseIE):
     # the ID is an UUID but I don't think it's necessary to write concrete regex
     _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)'
     IE_NAME = 'iwara:playlist'

From c8bc203fbf3bb09914e53f0833eed622ab7edbb9 Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Sat, 20 May 2023 02:35:08 +0530
Subject: [PATCH 04/75] [docs] Misc improvements

Closes #6814, closes #6940, closes #6733, closes #6923, closes #6566, closes #6726, closes #6728
---
 .github/ISSUE_TEMPLATE/1_broken_site.yml      |  6 ++---
 .github/ISSUE_TEMPLATE/4_bug_report.yml       |  2 +-
 .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml |  6 ++---
 .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml  |  2 +-
 .github/PULL_REQUEST_TEMPLATE.md              |  6 ++++++
 CONTRIBUTING.md                               |  4 ++--
 Collaborators.md                              |  8 +++----
 README.md                                     | 23 ++++++++++---------
 yt_dlp/YoutubeDL.py                           |  5 ++--
 yt_dlp/extractor/unsupported.py               |  5 ++--
 yt_dlp/options.py                             | 10 ++++----
 11 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml
index cdbb867603..77b777d5a9 100644
--- a/.github/ISSUE_TEMPLATE/1_broken_site.yml
+++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml
@@ -1,5 +1,5 @@
-name: Broken site
-description: Report error in a supported site
+name: Broken site support
+description: Report issue with yt-dlp on a supported site
 labels: [triage, site-bug]
 body:
   - type: checkboxes
@@ -16,7 +16,7 @@ body:
       description: |
         Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp:
       options:
-        - label: I'm reporting that a **supported** site is broken
+        - label: I'm reporting that yt-dlp is broken on a **supported** site
          required: true
        - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit)
          required: true
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml
index bf1d97bbae..122dda4f26 100644
--- a/.github/ISSUE_TEMPLATE/4_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml
@@ -1,4 +1,4 @@
-name: Bug report
+name: Core bug report
 description: Report a bug unrelated to any particular site or extractor
 labels: [triage, bug]
 body:
diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml
index 1f6f926341..a51db789f3 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml
+++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml
@@ -1,5 +1,5 @@
-name: Broken site
-description: Report error in a supported site
+name: Broken site support
+description: Report issue with yt-dlp on a supported site
 labels: [triage, site-bug]
 body:
 %(no_skip)s
@@ -10,7 +10,7 @@ body:
       description: |
         Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp:
       options:
-        - label: I'm reporting that a **supported** site is broken
+        - label: I'm reporting that yt-dlp is broken on a **supported** site
          required: true
        - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit)
          required: true
diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml
index 90f59e70b0..9ab4902673 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml
@@ -1,4 +1,4 @@
-name: Bug report
+name: Core bug report
 description: Report a bug unrelated to any particular site or extractor
 labels: [triage, bug]
 body:
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index c4d3e812e2..cbed821734 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -40,4 +40,10 @@ ### What is the purpose of your *pull request*?
 - [ ] Core bug fix/improvement
 - [ ] New feature (It is strongly [recommended to open an issue first](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#adding-new-feature-or-making-overarching-changes))
+
+
+<details open><summary>Copilot Summary</summary>
+
+copilot:all
+</details>
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ae2c454239..a8587fe92d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -79,7 +79,7 @@ ### Are you using the latest version?
 
 ### Is the issue already documented?
 
-Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2021.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
+Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, subscribe to it to be notified when there is any progress. Unless you have something useful to add to the conversation, please refrain from commenting.
 
 Additionally, it is also helpful to see if the issue has already been documented in the [youtube-dl issue tracker](https://github.com/ytdl-org/youtube-dl/issues). If similar issues have already been reported in youtube-dl (but not in our issue tracker), links to them can be included in your issue report here.
 
@@ -246,7 +246,7 @@ ## yt-dlp coding conventions
 
 This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
 
-Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the the extractor will remain broken.
+Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the extractor will remain broken.
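
A sketch of what this convention looks like in practice (the field paths and helper name below are illustrative only, not code from any particular extractor): chain fallback locations with `traverse_obj` so that a layout change degrades to a missing field instead of a crash — the Facebook fix in PATCH 01 above applies exactly this pattern to `uploader_data`:

```python
from yt_dlp.utils import traverse_obj

def _extract_uploader_name(data):
    # Try the current layout first, then older/alternate layouts;
    # yield None instead of raising if the site changes yet again.
    return (
        traverse_obj(data, ('media', 'owner', 'name', {str}))
        or traverse_obj(data, ('node', 'actors', 0, 'name', {str}))
        or None)
```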
### Mandatory and optional metafields diff --git a/Collaborators.md b/Collaborators.md index 71baf5080b..a0976dd8c5 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -8,7 +8,7 @@ # Collaborators ## [pukkandan](https://github.com/pukkandan) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) * Owner of the fork @@ -26,7 +26,7 @@ ## [shirt](https://github.com/shirt-dev) ## [coletdjnz](https://github.com/coletdjnz) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) * Improved plugin architecture * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements @@ -44,7 +44,7 @@ ## [Ashish0804](https://github.com/Ashish0804) [Inactive] * Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc -## [Lesmiscore](https://github.com/Lesmiscore) (nao20010128nao) +## [Lesmiscore](https://github.com/Lesmiscore) **Bitcoin**: bc1qfd02r007cutfdjwjmyy9w23rjvtls6ncve7r3s **Monacoin**: mona1q3tf7dzvshrhfe3md379xtvt2n22duhglv5dskr @@ -64,7 +64,7 @@ ## [bashonly](https://github.com/bashonly) ## [Grub4K](https://github.com/Grub4K) -[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) [![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) +[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) [![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) * `--update-to`, automated release, nightly builds * Rework internals like `traverse_obj`, various core refactors and bugs fixes diff --git a/README.md b/README.md index 993ac5a5f6..6dff57b4c5 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ # NEW FEATURES * **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. 
* **YouTube improvements**: - * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, YouTube Music Albums/Channels ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)), and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) + * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) @@ -179,13 +179,13 @@ # INSTALLATION [![All versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases) -You can install yt-dlp using [the binaries](#release-files), [PIP](https://pypi.org/project/yt-dlp) or one using a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions +You can install yt-dlp using [the binaries](#release-files), [pip](https://pypi.org/project/yt-dlp) or one using a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions ## UPDATE You can use `yt-dlp -U` to update if you are using the [release binaries](#release-files) -If you [installed with PIP](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program +If you [installed with pip](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer their documentation @@ -409,7 +409,8 @@ ## General Options: configuration files --flat-playlist Do not extract the videos of a playlist, only list them - --no-flat-playlist Extract the videos of a playlist + --no-flat-playlist Fully extract the videos of a playlist + (default) --live-from-start Download livestreams from the start. Currently only supported for YouTube (Experimental) @@ -465,9 +466,9 @@ ## Geo-restriction: downloading --xff VALUE How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. One of - "default" (Only when known to be useful), - "never", a two-letter ISO 3166-2 country - code, or an IP block in CIDR notation + "default" (only when known to be useful), + "never", an IP block in CIDR notation, or a + two-letter ISO 3166-2 country code ## Video Selection: -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items @@ -514,7 +515,7 @@ ## Video Selection: dogs" (caseless). 
Use "--match-filter -" to interactively ask whether to download each video - --no-match-filter Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filter (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -1709,7 +1710,7 @@ # MODIFYING METADATA This option also has a few special uses: -* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?Phttps?://www\.vimeo\.com/\d+)` will download the first vimeo video found in the description +* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?Phttps?://www\.vimeo\.com/\d+)"` will download the first vimeo video found in the description * You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file - you can use this to set a different "description" and "synopsis". To modify the metadata of individual streams, use the `meta_` prefix (e.g. `meta1_language`). Any value set to the `meta_` field will overwrite all default values. @@ -1883,7 +1884,7 @@ ## Installing Plugins * **System Plugins** * `/etc/yt-dlp/plugins//yt_dlp_plugins/` * `/etc/yt-dlp-plugins//yt_dlp_plugins/` -2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location: +2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location (recommended for portable installations): * Binary: where `/yt-dlp.exe`, `/yt-dlp-plugins//yt_dlp_plugins/` * Source: where `/yt_dlp/__main__.py`, `/yt-dlp-plugins//yt_dlp_plugins/` @@ -2071,7 +2072,7 @@ #### Use a custom format selector ```python import yt_dlp -URL = ['https://www.youtube.com/watch?v=BaW_jenozKc'] +URLS = ['https://www.youtube.com/watch?v=BaW_jenozKc'] def format_selector(ctx): """ Select the best video and the best audio that won't result in an mkv. diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8ee42b86a6..8f52a71a95 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -190,6 +190,7 @@ class YoutubeDL: ap_username: Multiple-system operator account username. ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. + netrc_location: Location of the netrc file. Defaults to ~/.netrc. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. 
@@ -3994,7 +3995,7 @@ def _write_subtitles(self, info_dict, filename): # that way it will silently go on when used with unsupporting IE return ret elif not subtitles: - self.to_screen('[info] There\'s no subtitles for the requested languages') + self.to_screen('[info] There are no subtitles for the requested languages') return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: @@ -4048,7 +4049,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] if not thumbnails: - self.to_screen(f'[info] There\'s no {label} thumbnails to download') + self.to_screen(f'[info] There are no {label} thumbnails to download') return ret multiple = write_all and len(thumbnails) > 1 diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index a56bd284f9..1bc49786f9 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -131,8 +131,9 @@ class KnownPiracyIE(UnsupportedInfoExtractor): URLS = ( r'dood\.(?:to|watch|so|pm|wf|re)', # Sites youtube-dl supports, but we won't - r'https://viewsb\.com', - r'https://filemoon\.sx', + r'viewsb\.com', + r'filemoon\.sx', + r'hentai\.animestigma\.com', ) _TESTS = [{ diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 362a648cdd..dc46ce9984 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -411,7 +411,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '--no-flat-playlist', action='store_false', dest='extract_flat', - help='Extract the videos of a playlist') + help='Fully extract the videos of a playlist (default)') general.add_option( '--live-from-start', action='store_true', dest='live_from_start', @@ -521,11 +521,11 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=optparse.SUPPRESS_HELP) geo.add_option( '--xff', metavar='VALUE', - dest='geo_bypass', default="default", + dest='geo_bypass', default='default', help=( 'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. ' - 'One of "default" (Only when known to be useful), "never", ' - 'a two-letter ISO 3166-2 country code, or an IP block in CIDR notation')) + 'One of "default" (only when known to be useful), "never", ' + 'an IP block in CIDR notation, or a two-letter ISO 3166-2 country code')) geo.add_option( '--geo-bypass', action='store_const', dest='geo_bypass', const='default', @@ -617,7 +617,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'that contains the phrase "cats & dogs" (caseless). ' 'Use "--match-filter -" to interactively ask whether to download each video')) selection.add_option( - '--no-match-filter', + '--no-match-filters', dest='match_filter', action='store_const', const=None, help='Do not use any --match-filter (default)') selection.add_option( From f7f7a877bf8e87fd4eb0ad2494ad948ca7691114 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 20 May 2023 04:05:22 +0530 Subject: [PATCH 05/75] [extractor/booyah] Remove extractor Site shut down. 
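
The module and its `_extractors.py` import are deleted outright, which is the usual course when a site is gone for good. A hypothetical alternative — sketched here by analogy with the `KnownPiracyIE` pattern in `yt_dlp/extractor/unsupported.py`, touched in the previous patch — would be routing the dead domain to an explanatory error; the class name and message below are illustrative only, not part of this change:

```python
from yt_dlp.extractor.unsupported import UnsupportedInfoExtractor
from yt_dlp.utils import ExtractorError

class BooyahGoneIE(UnsupportedInfoExtractor):  # illustrative name only
    URLS = (r'booyah\.live',)  # domain regex, as in KnownPiracyIE

    def _real_extract(self, url):
        raise ExtractorError('booyah.live has shut down', expected=True)
```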
Closes #6425
---
 yt_dlp/extractor/_extractors.py |  1 -
 yt_dlp/extractor/booyah.py      | 86 ---------------------------------
 2 files changed, 87 deletions(-)
 delete mode 100644 yt_dlp/extractor/booyah.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 974c8a2548..fd2bfa9a10 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -247,7 +247,6 @@
 from .bostonglobe import BostonGlobeIE
 from .box import BoxIE
 from .boxcast import BoxCastVideoIE
-from .booyah import BooyahClipsIE
 from .bpb import BpbIE
 from .br import (
     BRIE,
diff --git a/yt_dlp/extractor/booyah.py b/yt_dlp/extractor/booyah.py
deleted file mode 100644
index 5c55f2c765..0000000000
--- a/yt_dlp/extractor/booyah.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from .common import InfoExtractor
-from ..utils import int_or_none, str_or_none, traverse_obj
-
-
-class BooyahBaseIE(InfoExtractor):
-    _BOOYAH_SESSION_KEY = None
-
-    def _real_initialize(self):
-        BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage(
-            'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key')
-
-    def _get_comments(self, video_id):
-        comment_json = self._download_json(
-            f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id,
-            headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {}
-
-        return [{
-            'id': comment.get('comment_id'),
-            'author': comment.get('from_nickname'),
-            'author_id': comment.get('from_uid'),
-            'author_thumbnail': comment.get('from_thumbnail'),
-            'text': comment.get('content'),
-            'timestamp': comment.get('create_time'),
-            'like_count': comment.get('like_cnt'),
-        } for comment in comment_json.get('comment_list') or ()]
-
-
-class BooyahClipsIE(BooyahBaseIE):
-    _VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://booyah.live/clips/13887261322952306617',
-        'info_dict': {
-            'id': '13887261322952306617',
-            'ext': 'mp4',
-            'view_count': int,
-            'duration': 30,
-            'channel_id': 90565760,
-            'like_count': int,
-            'title': 'Cayendo con estilo 😎',
-            'uploader': '♡LɪꜱGΛ​MER​',
-            'comment_count': int,
-            'uploader_id': '90565760',
-            'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg',
-            'upload_date': '20220617',
-            'timestamp': 1655490556,
-            'modified_timestamp': 1655490556,
-            'modified_date': '20220617',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        json_data = self._download_json(
-            f'https://booyah.live/api/v3/playbacks/{video_id}', video_id,
-            headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY})
-
-        formats = []
-        for video_data in json_data['playback']['endpoint_list']:
-            formats.extend(({
-                'url': video_data.get('stream_url'),
-                'ext': 'mp4',
-                'height': video_data.get('resolution'),
-            }, {
-                'url': video_data.get('download_url'),
-                'ext': 'mp4',
-                'format_note': 'Watermarked',
-                'height': video_data.get('resolution'),
-                'preference': -10,
-            }))
-
-        return {
-            'id': video_id,
-            'title': traverse_obj(json_data, ('playback', 'name')),
-            'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')),
-            'formats': formats,
-            'view_count': traverse_obj(json_data, ('playback', 'views')),
-            'like_count': traverse_obj(json_data, ('playback', 'likes')),
-            'duration': traverse_obj(json_data, ('playback', 'duration')),
-            'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')),
-            'channel_id': traverse_obj(json_data, ('playback', 'channel_id')),
-            'uploader':
traverse_obj(json_data, ('user', 'nickname')), - 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), - 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), - 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), - '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), - } From 1d7656184c6b8aa46b29149893894b3c24f1df00 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 20 May 2023 02:57:59 +0530 Subject: [PATCH 06/75] [jsinterp] Handle `NaN` in bitwise operators Closes #6131 --- test/test_jsinterp.py | 10 ++++++++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 7 ++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 3283657d70..26711502a4 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -445,6 +445,16 @@ def test_bitwise_operators_overflow(self): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_bitwise_operators_typecast(self): + jsi = JSInterpreter('function x(){return null << 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return undefined >> 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return 42 << NaN}') + self.assertEqual(jsi.call_function('x'), 42) + def test_negative(self): jsi = JSInterpreter("function f(){return 2 * -2.0;}") self.assertEqual(jsi.call_function('f'), -4) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index e2b3f0870d..13120d97f8 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -146,6 +146,10 @@ 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', ), + ( + 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', + 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 5571ecfeb1..965b1c0f29 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -20,7 +20,12 @@ def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + if x in (None, JS_Undefined): + return 0 + with contextlib.suppress(TypeError): + if math.isnan(x): # NB: NaN cannot be checked by membership + return 0 + return x def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff From 6f2287cb18cbfb27518f068d868fa9390fee78ad Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 20 May 2023 03:06:23 +0530 Subject: [PATCH 07/75] [cleanup] Misc Closes #7030, closes #6967 --- test/helper.py | 4 +- test/test_YoutubeDL.py | 8 +- test/test_jsinterp.py | 558 +++++++++++++++-------------------------- yt_dlp/YoutubeDL.py | 65 ++--- yt_dlp/jsinterp.py | 2 +- yt_dlp/utils.py | 8 +- 6 files changed, 243 insertions(+), 402 deletions(-) diff --git a/test/helper.py b/test/helper.py index 0b90660ff6..539b2f6189 100644 --- a/test/helper.py +++ b/test/helper.py @@ -194,8 +194,8 @@ def sanitize_got_info_dict(got_dict): 'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries', # Auto-generated - 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch', - 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', 'n_entries', + 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch', 
'n_entries',
+        'fulltitle', 'extractor', 'extractor_key', 'filename', 'filepath', 'infojson_filename', 'original_url',
 
         # Only live_status needs to be checked
         'is_live', 'was_live',
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 3c26bd7c65..477fd220ef 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -757,7 +757,7 @@ def expect_same_infodict(out):
         test('%(id)r %(height)r', "'1234' 1080")
         test('%(ext)s-%(ext|def)d', 'mp4-def')
         test('%(width|0)04d', '0000')
-        test('a%(width|)d', 'a', outtmpl_na_placeholder='none')
+        test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none')
 
         FORMATS = self.outtmpl_info['formats']
         sanitize = lambda x: x.replace(':', '：').replace('"', "＂").replace('\n', ' ')
@@ -871,12 +871,12 @@ def test_postprocessors(self):
 
         class SimplePP(PostProcessor):
             def run(self, info):
-                with open(audiofile, 'wt') as f:
+                with open(audiofile, 'w') as f:
                     f.write('EXAMPLE')
                 return [info['filepath']], info
 
         def run_pp(params, PP):
-            with open(filename, 'wt') as f:
+            with open(filename, 'w') as f:
                 f.write('EXAMPLE')
             ydl = YoutubeDL(params)
             ydl.add_post_processor(PP())
@@ -895,7 +895,7 @@ def run_pp(params, PP):
 
         class ModifierPP(PostProcessor):
             def run(self, info):
-                with open(info['filepath'], 'wt') as f:
+                with open(info['filepath'], 'w') as f:
                     f.write('MODIFIED')
                 return [], info
 
diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py
index 26711502a4..444909b84b 100644
--- a/test/test_jsinterp.py
+++ b/test/test_jsinterp.py
@@ -14,462 +14,302 @@
 
 
 class TestJSInterpreter(unittest.TestCase):
+    def _test(self, code, ret, func='f', args=()):
+        self.assertEqual(JSInterpreter(code).call_function(func, *args), ret)
+
     def test_basic(self):
-        jsi = JSInterpreter('function x(){;}')
-        self.assertEqual(jsi.call_function('x'), None)
-
-        jsi = JSInterpreter('function x3(){return 42;}')
-        self.assertEqual(jsi.call_function('x3'), 42)
-
-        jsi = JSInterpreter('function x3(){42}')
-        self.assertEqual(jsi.call_function('x3'), None)
-
-        jsi = JSInterpreter('var x5 = function(){return 42;}')
-        self.assertEqual(jsi.call_function('x5'), 42)
-
-    def test_calc(self):
-        jsi = JSInterpreter('function x4(a){return 2*a+1;}')
-        self.assertEqual(jsi.call_function('x4', 3), 7)
-
-    def test_empty_return(self):
-        jsi = JSInterpreter('function f(){return; y()}')
+        jsi = JSInterpreter('function f(){;}')
+        self.assertEqual(repr(jsi.extract_function('f')), 'F<f>')
         self.assertEqual(jsi.call_function('f'), None)
 
-    def test_morespace(self):
-        jsi = JSInterpreter('function x (a) { return 2 * a + 1 ; }')
-        self.assertEqual(jsi.call_function('x', 3), 7)
+        self._test('function f(){return 42;}', 42)
+        self._test('function f(){42}', None)
+        self._test('var f = function(){return 42;}', 42)
 
-        jsi = JSInterpreter('function f () { x = 2 ; return x; }')
-        self.assertEqual(jsi.call_function('f'), 2)
+    def test_calc(self):
+        self._test('function f(a){return 2*a+1;}', 7, args=[3])
+
+    def test_empty_return(self):
+        self._test('function f(){return; y()}', None)
+
+    def test_morespace(self):
+        self._test('function f (a) { return 2 * a + 1 ; }', 7, args=[3])
+        self._test('function f () { x = 2 ; return x; }', 2)
 
     def test_strange_chars(self):
-        jsi = JSInterpreter('function $_xY1 ($_axY1) { var $_axY2 = $_axY1 + 1; return $_axY2; }')
-        self.assertEqual(jsi.call_function('$_xY1', 20), 21)
+        self._test('function $_xY1 ($_axY1) { var $_axY2 = $_axY1 + 1; return $_axY2; }',
+                   21, args=[20], func='$_xY1')
 
     def test_operators(self):
-        jsi = JSInterpreter('function f(){return 1 << 5;}')
-
self.assertEqual(jsi.call_function('f'), 32) - - jsi = JSInterpreter('function f(){return 2 ** 5}') - self.assertEqual(jsi.call_function('f'), 32) - - jsi = JSInterpreter('function f(){return 19 & 21;}') - self.assertEqual(jsi.call_function('f'), 17) - - jsi = JSInterpreter('function f(){return 11 >> 2;}') - self.assertEqual(jsi.call_function('f'), 2) - - jsi = JSInterpreter('function f(){return []? 2+3: 4;}') - self.assertEqual(jsi.call_function('f'), 5) - - jsi = JSInterpreter('function f(){return 1 == 2}') - self.assertEqual(jsi.call_function('f'), False) - - jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') - self.assertEqual(jsi.call_function('f'), 2) - - jsi = JSInterpreter('function f(){return 0 ?? 42;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return "life, the universe and everything" < 42;}') - self.assertFalse(jsi.call_function('f')) + self._test('function f(){return 1 << 5;}', 32) + self._test('function f(){return 2 ** 5}', 32) + self._test('function f(){return 19 & 21;}', 17) + self._test('function f(){return 11 >> 2;}', 2) + self._test('function f(){return []? 2+3: 4;}', 5) + self._test('function f(){return 1 == 2}', False) + self._test('function f(){return 0 && 1 || 2;}', 2) + self._test('function f(){return 0 ?? 42;}', 0) + self._test('function f(){return "life, the universe and everything" < 42;}', False) def test_array_access(self): - jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') - self.assertEqual(jsi.call_function('f'), [5, 2, 7]) + self._test('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}', [5, 2, 7]) def test_parens(self): - jsi = JSInterpreter('function f(){return (1) + (2) * ((( (( (((((3)))))) )) ));}') - self.assertEqual(jsi.call_function('f'), 7) - - jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') - self.assertEqual(jsi.call_function('f'), 9) + self._test('function f(){return (1) + (2) * ((( (( (((((3)))))) )) ));}', 7) + self._test('function f(){return (1 + 2) * 3;}', 9) def test_quotes(self): - jsi = JSInterpreter(R'function f(){return "a\"\\("}') - self.assertEqual(jsi.call_function('f'), R'a"\(') + self._test(R'function f(){return "a\"\\("}', R'a"\(') def test_assignments(self): - jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), 31) - - jsi = JSInterpreter('function f(){var x = 20; x += 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), 51) - - jsi = JSInterpreter('function f(){var x = 20; x -= 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), -11) + self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31) + self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) + self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) def test_comments(self): 'Skipping: Not yet fully implemented' return - jsi = JSInterpreter(''' - function x() { - var x = /* 1 + */ 2; - var y = /* 30 - * 40 */ 50; - return x + y; - } - ''') - self.assertEqual(jsi.call_function('x'), 52) + self._test(''' + function f() { + var x = /* 1 + */ 2; + var y = /* 30 + * 40 */ 50; + return x + y; + } + ''', 52) - jsi = JSInterpreter(''' - function f() { - var x = "/*"; - var y = 1 /* comment */ + 2; - return y; - } - ''') - self.assertEqual(jsi.call_function('f'), 3) + self._test(''' + function f() { + var x = "/*"; + var y = 1 /* comment */ + 2; + return y; + } + ''', 3) def test_precedence(self): - jsi = JSInterpreter(''' - function x() { - 
var a = [10, 20, 30, 40, 50]; - var b = 6; - a[0]=a[b%a.length]; - return a; - }''') - self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + self._test(''' + function f() { + var a = [10, 20, 30, 40, 50]; + var b = 6; + a[0]=a[b%a.length]; + return a; + } + ''', [20, 20, 30, 40, 50]) def test_builtins(self): - jsi = JSInterpreter(''' - function x() { return NaN } - ''') - self.assertTrue(math.isnan(jsi.call_function('x'))) + jsi = JSInterpreter('function f() { return NaN }') + self.assertTrue(math.isnan(jsi.call_function('f'))) - jsi = JSInterpreter(''' - function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) - jsi = JSInterpreter(''' - function x(dt) { return new Date(dt) - 0; } - ''') - self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', + 86000) + self._test('function f(dt) { return new Date(dt) - 0; }', + 86000, args=['Wednesday 31 December 1969 18:01:26 MDT']) def test_call(self): jsi = JSInterpreter(''' - function x() { return 2; } - function y(a) { return x() + (a?a:0); } - function z() { return y(3); } + function x() { return 2; } + function y(a) { return x() + (a?a:0); } + function z() { return y(3); } ''') self.assertEqual(jsi.call_function('z'), 5) self.assertEqual(jsi.call_function('y'), 2) def test_if(self): - jsi = JSInterpreter(''' - function x() { - let a = 9; - if (0==0) {a++} - return a - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + let a = 9; + if (0==0) {a++} + return a + } + ''', 10) - jsi = JSInterpreter(''' - function x() { - if (0==0) {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0==0) {return 10} + } + ''', 10) - jsi = JSInterpreter(''' - function x() { - if (0!=0) {return 1} - else {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0!=0) {return 1} + else {return 10} + } + ''', 10) """ # Unsupported - jsi = JSInterpreter(''' - function x() { - if (0!=0) {return 1} - else if (1==0) {return 2} - else {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0!=0) {return 1} + else if (1==0) {return 2} + else {return 10} + } + ''', 10) """ def test_for_loop(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) {a++} return a } - ''') - self.assertEqual(jsi.call_function('x'), 10) + self._test('function f() { a=0; for (i=0; i-10; i++) {a++} return a }', 10) def test_switch(self): jsi = JSInterpreter(''' - function x(f) { switch(f){ - case 1:f+=1; - case 2:f+=2; - case 3:f+=3;break; - case 4:f+=4; - default:f=0; - } return f } + function f(x) { switch(x){ + case 1:x+=1; + case 2:x+=2; + case 3:x+=3;break; + case 4:x+=4; + default:x=0; + } return x } ''') - self.assertEqual(jsi.call_function('x', 1), 7) - self.assertEqual(jsi.call_function('x', 3), 6) - self.assertEqual(jsi.call_function('x', 5), 0) + self.assertEqual(jsi.call_function('f', 1), 7) + self.assertEqual(jsi.call_function('f', 3), 6) + self.assertEqual(jsi.call_function('f', 5), 0) def test_switch_default(self): jsi = JSInterpreter(''' - function x(f) { switch(f){ - case 2: f+=2; - default: f-=1; - case 5: - case 6: f+=6; - case 0: break; - case 1: f+=1; - } return f } + function f(x) { switch(x){ + case 2: x+=2; + default: x-=1; + case 5: + case 6: x+=6; 
+ case 0: break; + case 1: x+=1; + } return x } ''') - self.assertEqual(jsi.call_function('x', 1), 2) - self.assertEqual(jsi.call_function('x', 5), 11) - self.assertEqual(jsi.call_function('x', 9), 14) + self.assertEqual(jsi.call_function('f', 1), 2) + self.assertEqual(jsi.call_function('f', 5), 11) + self.assertEqual(jsi.call_function('f', 9), 14) def test_try(self): - jsi = JSInterpreter(''' - function x() { try{return 10} catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 10) + self._test('function f() { try{return 10} catch(e){return 5} }', 10) def test_catch(self): - jsi = JSInterpreter(''' - function x() { try{throw 10} catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test('function f() { try{throw 10} catch(e){return 5} }', 5) def test_finally(self): - jsi = JSInterpreter(''' - function x() { try{throw 10} finally {return 42} } - ''') - self.assertEqual(jsi.call_function('x'), 42) - jsi = JSInterpreter(''' - function x() { try{throw 10} catch(e){return 5} finally {return 42} } - ''') - self.assertEqual(jsi.call_function('x'), 42) + self._test('function f() { try{throw 10} finally {return 42} }', 42) + self._test('function f() { try{throw 10} catch(e){return 5} finally {return 42} }', 42) def test_nested_try(self): - jsi = JSInterpreter(''' - function x() {try { - try{throw 10} finally {throw 42} - } catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test(''' + function f() {try { + try{throw 10} finally {throw 42} + } catch(e){return 5} } + ''', 5) def test_for_loop_continue(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } - ''') - self.assertEqual(jsi.call_function('x'), 0) + self._test('function f() { a=0; for (i=0; i-10; i++) { continue; a++ } return a }', 0) def test_for_loop_break(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { break; a++ } return a } - ''') - self.assertEqual(jsi.call_function('x'), 0) + self._test('function f() { a=0; for (i=0; i-10; i++) { break; a++ } return a }', 0) def test_for_loop_try(self): - jsi = JSInterpreter(''' - function x() { - for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; - return 42 } - ''') - self.assertEqual(jsi.call_function('x'), 42) + self._test(''' + function f() { + for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; + return 42 } + ''', 42) def test_literal_list(self): - jsi = JSInterpreter(''' - function x() { return [1, 2, "asdf", [5, 6, 7]][3] } - ''') - self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + self._test('function f() { return [1, 2, "asdf", [5, 6, 7]][3] }', [5, 6, 7]) def test_comma(self): - jsi = JSInterpreter(''' - function x() { a=5; a -= 1, a+=3; return a } - ''') - self.assertEqual(jsi.call_function('x'), 7) - - jsi = JSInterpreter(''' - function x() { a=5; return (a -= 1, a+=3, a); } - ''') - self.assertEqual(jsi.call_function('x'), 7) - - jsi = JSInterpreter(''' - function x() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test('function f() { a=5; a -= 1, a+=3; return a }', 7) + self._test('function f() { a=5; return (a -= 1, a+=3, a); }', 7) + self._test('function f() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) }', 5) def test_void(self): - jsi = JSInterpreter(''' - function x() { return void 42; } - ''') - self.assertEqual(jsi.call_function('x'), None) + 
self._test('function f() { return void 42; }', None) def test_return_function(self): jsi = JSInterpreter(''' - function x() { return [1, function(){return 1}][1] } + function f() { return [1, function(){return 1}][1] } ''') - self.assertEqual(jsi.call_function('x')([]), 1) + self.assertEqual(jsi.call_function('f')([]), 1) def test_null(self): - jsi = JSInterpreter(''' - function x() { return null; } - ''') - self.assertEqual(jsi.call_function('x'), None) - - jsi = JSInterpreter(''' - function x() { return [null > 0, null < 0, null == 0, null === 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, False, False]) - - jsi = JSInterpreter(''' - function x() { return [null >= 0, null <= 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [True, True]) + self._test('function f() { return null; }', None) + self._test('function f() { return [null > 0, null < 0, null == 0, null === 0]; }', + [False, False, False, False]) + self._test('function f() { return [null >= 0, null <= 0]; }', [True, True]) def test_undefined(self): - jsi = JSInterpreter(''' - function x() { return undefined === undefined; } - ''') - self.assertEqual(jsi.call_function('x'), True) + self._test('function f() { return undefined === undefined; }', True) + self._test('function f() { return undefined; }', JS_Undefined) + self._test('function f() {return undefined ?? 42; }', 42) + self._test('function f() { let v; return v; }', JS_Undefined) + self._test('function f() { let v; return v**0; }', 1) + self._test('function f() { let v; return [v>42, v<=42, v&&42, 42&&v]; }', + [False, False, JS_Undefined, JS_Undefined]) + + self._test(''' + function f() { return [ + undefined === undefined, + undefined == undefined, + undefined == null, + undefined < undefined, + undefined > undefined, + undefined === 0, + undefined == 0, + undefined < 0, + undefined > 0, + undefined >= 0, + undefined <= 0, + undefined > null, + undefined < null, + undefined === null + ]; } + ''', list(map(bool, (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)))) jsi = JSInterpreter(''' - function x() { return undefined; } + function f() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { let v; return v; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } - ''') - self.assertEqual(jsi.call_function('x'), [True, True, False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined === 0, undefined == 0, undefined < 0, undefined > 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined >= 0, undefined <= 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined > null, undefined < null, undefined == null, undefined === null]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, True, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined === null, undefined == null, undefined < null, undefined > null]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, True, False, False]) - - jsi = JSInterpreter(''' - function x() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } - ''') - for y in jsi.call_function('x'): + for y in jsi.call_function('f'): 
self.assertTrue(math.isnan(y)) - jsi = JSInterpreter(''' - function x() { let v; return v**0; } - ''') - self.assertEqual(jsi.call_function('x'), 1) - - jsi = JSInterpreter(''' - function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) - - jsi = JSInterpreter('function x(){return undefined ?? 42; }') - self.assertEqual(jsi.call_function('x'), 42) - def test_object(self): - jsi = JSInterpreter(''' - function x() { return {}; } - ''') - self.assertEqual(jsi.call_function('x'), {}) - - jsi = JSInterpreter(''' - function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } - ''') - self.assertEqual(jsi.call_function('x'), [42, 0]) - - jsi = JSInterpreter(''' - function x() { let a; return a?.qq; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) + self._test('function f() { return {}; }', {}) + self._test('function f() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; }', [42, 0]) + self._test('function f() { let a; return a?.qq; }', JS_Undefined) + self._test('function f() { let a = {m1: 42, m2: 0 }; return a?.qq; }', JS_Undefined) def test_regex(self): - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; } - ''') - self.assertEqual(jsi.call_function('x'), None) + self._test('function f() { let a=/,,[/,913,/](,)}/; }', None) - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; return a; } - ''') - self.assertIsInstance(jsi.call_function('x'), re.Pattern) + jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/; return a; }') + self.assertIsInstance(jsi.call_function('f'), re.Pattern) - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/i; return a; } - ''') - self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/i; return a; }') + self.assertEqual(jsi.call_function('f').flags & re.I, re.I) - jsi = JSInterpreter(R''' - function x() { let a=/,][}",],()}(\[)/; return a; } - ''') - self.assertEqual(jsi.call_function('x').pattern, r',][}",],()}(\[)') + jsi = JSInterpreter(R'function f() { let a=/,][}",],()}(\[)/; return a; }') + self.assertEqual(jsi.call_function('f').pattern, r',][}",],()}(\[)') - jsi = JSInterpreter(R''' - function x() { let a=[/[)\\]/]; return a[0]; } - ''') - self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') + self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') def test_char_code_at(self): - jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') - self.assertEqual(jsi.call_function('x', 0), 116) - self.assertEqual(jsi.call_function('x', 1), 101) - self.assertEqual(jsi.call_function('x', 2), 115) - self.assertEqual(jsi.call_function('x', 3), 116) - self.assertEqual(jsi.call_function('x', 4), None) - self.assertEqual(jsi.call_function('x', 'not_a_number'), 116) + jsi = JSInterpreter('function f(i){return "test".charCodeAt(i)}') + self.assertEqual(jsi.call_function('f', 0), 116) + self.assertEqual(jsi.call_function('f', 1), 101) + self.assertEqual(jsi.call_function('f', 2), 115) + self.assertEqual(jsi.call_function('f', 3), 116) + self.assertEqual(jsi.call_function('f', 4), None) + self.assertEqual(jsi.call_function('f', 'not_a_number'), 116) def test_bitwise_operators_overflow(self): - jsi = 
JSInterpreter('function x(){return -524999584 << 5}') - self.assertEqual(jsi.call_function('x'), 379882496) - - jsi = JSInterpreter('function x(){return 1236566549 << 5}') - self.assertEqual(jsi.call_function('x'), 915423904) + self._test('function f(){return -524999584 << 5}', 379882496) + self._test('function f(){return 1236566549 << 5}', 915423904) def test_bitwise_operators_typecast(self): - jsi = JSInterpreter('function x(){return null << 5}') - self.assertEqual(jsi.call_function('x'), 0) - - jsi = JSInterpreter('function x(){return undefined >> 5}') - self.assertEqual(jsi.call_function('x'), 0) - - jsi = JSInterpreter('function x(){return 42 << NaN}') - self.assertEqual(jsi.call_function('x'), 42) + self._test('function f(){return null << 5}', 0) + self._test('function f(){return undefined >> 5}', 0) + self._test('function f(){return 42 << NaN}', 42) def test_negative(self): - jsi = JSInterpreter("function f(){return 2 * -2.0;}") - self.assertEqual(jsi.call_function('f'), -4) - - jsi = JSInterpreter('function f(){return 2 - - -2;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return 2 - - - -2;}') - self.assertEqual(jsi.call_function('f'), 4) - - jsi = JSInterpreter('function f(){return 2 - + + - -2;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return 2 + - + - -2;}') - self.assertEqual(jsi.call_function('f'), 0) + self._test('function f(){return 2 * -2.0 ;}', -4) + self._test('function f(){return 2 - - -2 ;}', 0) + self._test('function f(){return 2 - - - -2 ;}', 4) + self._test('function f(){return 2 - + + - -2;}', 0) + self._test('function f(){return 2 + - + - -2;}', 0) if __name__ == '__main__': diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8f52a71a95..91aec1fe6e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -13,6 +13,7 @@ import random import re import shutil +import string import subprocess import sys import tempfile @@ -21,7 +22,6 @@ import traceback import unicodedata import urllib.request -from string import Formatter, ascii_letters from .cache import Cache from .compat import compat_os_name, compat_shlex_quote @@ -1079,7 +1079,7 @@ def _outtmpl_expandpath(outtmpl): # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. - sep = ''.join(random.choices(ascii_letters, k=32)) + sep = ''.join(random.choices(string.ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution @@ -1238,7 +1238,7 @@ def _dumpjson_default(obj): return list(obj) return repr(obj) - class _ReplacementFormatter(Formatter): + class _ReplacementFormatter(string.Formatter): def get_field(self, field_name, args, kwargs): if field_name.isdigit(): return args[0], -1 @@ -2068,86 +2068,86 @@ def syntax_error(note, start): def _parse_filter(tokens): filter_parts = [] - for type, string, start, _, _ in tokens: - if type == tokenize.OP and string == ']': + for type, string_, start, _, _ in tokens: + if type == tokenize.OP and string_ == ']': return ''.join(filter_parts) else: - filter_parts.append(string) + filter_parts.append(string_) def _remove_unused_ops(tokens): # Remove operators that we don't use and join them with the surrounding strings. # E.g. 
'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': + for type, string_, start, end, line in tokens: + if type == tokenize.OP and string_ == '[': if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': + for type, string_, start, end, line in tokens: + yield type, string_, start, end, line + if type == tokenize.OP and string_ == ']': break - elif type == tokenize.OP and string in ALLOWED_OPS: + elif type == tokenize.OP and string_ in ALLOWED_OPS: if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: if not last_string: - last_string = string + last_string = string_ last_start = start last_end = end else: - last_string += string + last_string += string_ if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None - for type, string, start, _, _ in tokens: + for type, string_, start, _, _ in tokens: # ENCODING is only defined in python 3.x if type == getattr(tokenize, 'ENCODING', None): continue elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) + current_selector = FormatSelector(SINGLE, string_, []) elif type == tokenize.OP: - if string == ')': + if string_ == ')': if not inside_group: # ')' will be handled by the parentheses group tokens.restore_last_token() break - elif inside_merge and string in ['/', ',']: + elif inside_merge and string_ in ['/', ',']: tokens.restore_last_token() break - elif inside_choice and string == ',': + elif inside_choice and string_ == ',': tokens.restore_last_token() break - elif string == ',': + elif string_ == ',': if not current_selector: raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None - elif string == '/': + elif string_ == '/': if not current_selector: raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': + elif string_ == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) - elif string == '(': + elif string_ == '(': if current_selector: raise syntax_error('Unexpected "("', start) group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) - elif string == '+': + elif string_ == '+': if not current_selector: raise syntax_error('Unexpected "+"', start) selector_1 = current_selector @@ -2156,7 +2156,7 @@ def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, ins raise 
syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise syntax_error(f'Operator not recognized: "{string}"', start) + raise syntax_error(f'Operator not recognized: "{string_}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2898,7 +2898,7 @@ def format_tmpl(tmpl): fmt = '%({})s' if tmpl.startswith('{'): - tmpl = f'.{tmpl}' + tmpl, fmt = f'.{tmpl}', '%({})j' if tmpl.endswith('='): tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(','))) @@ -2937,7 +2937,8 @@ def print_field(field, actual_field=None, optional=False): print_field('url', 'urls') print_field('thumbnail', optional=True) print_field('description', optional=True) - print_field('filename', optional=True) + if filename: + print_field('filename') if self.params.get('forceduration') and info_copy.get('duration') is not None: self.to_stdout(formatSeconds(info_copy['duration'])) print_field('format') @@ -3419,8 +3420,8 @@ def sanitize_info(info_dict, remove_private_keys=False): if remove_private_keys: reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', - 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', - '_format_sort_fields', + 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url', + 'playlist_autonumber', '_format_sort_fields', } else: reject = lambda k, v: False @@ -3489,7 +3490,7 @@ def run_pp(self, pp, infodict): *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict - def run_all_pps(self, key, info, *, additional_pps=None, fatal=True): + def run_all_pps(self, key, info, *, additional_pps=None): if key != 'video': self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 965b1c0f29..82974fb27b 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -248,7 +248,7 @@ def _separate(expr, delim=',', max_split=None): return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 - in_quote, escaping, after_op, in_regex_char_group, in_unary_op = None, False, True, False, False + in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 47aa75c470..190af1b7d7 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3281,7 +3281,7 @@ def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO def variadic(x, allowed_types=NO_DEFAULT): - return x if is_iterable_like(x, blocked_types=allowed_types) else (x,) + return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -5404,7 +5404,7 @@ def to_high_limit_path(path): def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traverse_obj(obj, *variadic(field)) - if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore): + if not val if ignore is NO_DEFAULT else val in variadic(ignore): return default return template % func(val) @@ -5704,8 +5704,8 @@ def traverse_dict(dictn, keys, casesense=True): return traverse_obj(dictn, 
keys, casesense=casesense, is_user_input=True, traverse_string=True) -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) +def get_first(obj, *paths, **kwargs): + return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) def time_seconds(**kwargs): From 447afb9eaa65bc677e3245c83e53a8e69c174a3c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 20 May 2023 19:11:03 +1200 Subject: [PATCH 08/75] [extractor/youtube] Support podcasts and releases tabs Closes https://github.com/yt-dlp/yt-dlp/issues/6893 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 48 ++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2b17751e5e..d089822f64 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4639,11 +4639,19 @@ def _playlist_entries(self, video_list_renderer): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( - rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} + rich_grid_renderer, + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} video_id = renderer.get('videoId') - if not video_id: + if video_id: + yield self._extract_video(renderer) + return + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=self._get_text(renderer, 'title')) return - yield self._extract_video(renderer) def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -6185,6 +6193,40 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': '3Blue1Brown', }, 'playlist_count': 0, + }, { + # Podcasts tab, with rich entry playlistRenderers + 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts', + 'info_dict': { + 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast', + 'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c', + 'title': '99 Percent Invisible - Podcasts', + 'uploader': '99 Percent Invisible', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'tags': [], + 'channel': '99 Percent Invisible', + 'uploader_id': '@99percentinvisiblepodcast', + }, + 'playlist_count': 1, + }, { + # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + 'url': 'https://www.youtube.com/@AHimitsu/releases', + 'info_dict': { + 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'channel': 'A Himitsu', + 'uploader_url': 'https://www.youtube.com/@AHimitsu', + 'title': 'A Himitsu - Releases', + 'uploader_id': '@AHimitsu', + 'uploader': 'A Himitsu', + 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'tags': 'count:16', + 'description': 'I make music', + 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', + 'channel_follower_count': int, + }, + 'playlist_mincount': 10, }] @classmethod From d2e84d5eb01c66fc5304e8566348d65a7be24ed7 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 3 Apr 2023 07:01:03 +0200 Subject: [PATCH 09/75] [update] Better error handling Authored by: pukkandan --- yt_dlp/__init__.py | 21 +++++++++++++-------- yt_dlp/update.py | 7 ++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/yt_dlp/__init__.py 
b/yt_dlp/__init__.py index 47ee3cc02f..8806106d31 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -13,6 +13,7 @@ import os import re import sys +import traceback from .compat import compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS @@ -937,14 +938,18 @@ def _real_main(argv=None): if opts.rm_cachedir: ydl.cache.remove() - updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None) - if opts.update_self and updater.update() and actual_use: - if updater.cmd: - return updater.restart() - # This code is reachable only for zip variant in py < 3.10 - # It makes sense to exit here, but the old behavior is to continue - ydl.report_warning('Restart yt-dlp to use the updated version') - # return 100, 'ERROR: The program must exit for the update to complete' + try: + updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + except Exception: + traceback.print_exc() + ydl._download_retcode = 100 if not actual_use: if pre_process: diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 5a752d7167..7914de832f 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -16,6 +16,7 @@ Popen, cached_method, deprecation_warning, + network_exceptions, remove_end, remove_start, sanitized_Request, @@ -258,8 +259,8 @@ def check_update(self): self.ydl.to_screen(( f'Available version: {self._label(self.target_channel, self.latest_version)}, ' if self.target_tag == 'latest' else '' ) + f'Current version: {self._label(CHANNEL, self.current_version)}') - except Exception: - return self._report_network_error('obtain version info', delim='; Please try again later or') + except network_exceptions as e: + return self._report_network_error(f'obtain version info ({e})', delim='; Please try again later or') if not is_non_updateable(): self.ydl.to_screen(f'Current Build Hash: {_sha256_file(self.filename)}') @@ -303,7 +304,7 @@ def update(self): try: newcontent = self._download(self.release_name, self._tag) - except Exception as e: + except network_exceptions as e: if isinstance(e, urllib.error.HTTPError) and e.code == 404: return self._report_error( f'The requested tag {self._label(self.target_channel, self.target_tag)} does not exist', True) From 665472a7de3880578c0b7b3f95c71570c056368e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 20 May 2023 21:21:32 +0200 Subject: [PATCH 10/75] [update] Implement `--update-to` repo Authored by: Grub4K, pukkandan --- README.md | 13 ++++++++----- yt_dlp/__init__.py | 2 +- yt_dlp/options.py | 8 ++++---- yt_dlp/update.py | 41 ++++++++++++++++++++++++++++------------- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 6dff57b4c5..d0eaba7477 100644 --- a/README.md +++ b/README.md @@ -196,12 +196,15 @@ ## UPDATE The `nightly` channel has releases built after each push to the master branch, and will have the most recent fixes and additions, but also have more risk of regressions. They are available in [their own repo](https://github.com/yt-dlp/yt-dlp-nightly-builds/releases). When using `--update`/`-U`, a release binary will only update to its current channel. 
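As an aside, the `[CHANNEL@]TAG` grammar described here can be illustrated with a minimal sketch. This is *not* the actual updater code (that lives in `yt_dlp/update.py`); the helper name and the hard-coded channel list are assumptions for illustration only:

```python
# Hypothetical helper: split an update target into (channel, tag).
# Mirrors the rpartition('@')-style parsing this patch series describes.
def split_update_target(target, default_channel='stable'):
    channel, sep, tag = target.rpartition('@')
    if not sep:  # no '@': a bare channel/repository, or a bare tag
        if '/' in tag or tag in ('stable', 'nightly'):
            return tag, 'latest'
        return default_channel, tag
    return channel or default_channel, tag or 'latest'

assert split_update_target('nightly') == ('nightly', 'latest')
assert split_update_target('stable@2023.02.17') == ('stable', '2023.02.17')
assert split_update_target('example/yt-dlp@2023.03.01') == ('example/yt-dlp', '2023.03.01')
```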
-This release channel can be changed by using the `--update-to` option. `--update-to` can also be used to upgrade or downgrade to specific tags from a channel.
+`--update-to CHANNEL` can be used to switch to a different channel when a newer version is available. `--update-to [CHANNEL@]TAG` can also be used to upgrade or downgrade to specific tags from a channel.
+
+You may also use `--update-to <repository>` (`<owner>/<repository>`) to update to a channel on a completely different repository. Be careful with what repository you are updating to though, there is no verification done for binaries from different repositories.
 
 Example usage:
 * `yt-dlp --update-to nightly` change to `nightly` channel and update to its latest release
 * `yt-dlp --update-to stable@2023.02.17` upgrade/downgrade to release to `stable` channel tag `2023.02.17`
 * `yt-dlp --update-to 2023.01.06` upgrade/downgrade to tag `2023.01.06` if it exists on the current channel
+* `yt-dlp --update-to example/yt-dlp@2023.03.01` upgrade/downgrade to the release from the `example/yt-dlp` repository, tag `2023.03.01`
 
 ## RELEASE FILES
 
@@ -360,10 +363,10 @@ ## General Options:
     -U, --update                    Update this program to the latest version
     --no-update                     Do not check for updates (default)
     --update-to [CHANNEL]@[TAG]     Upgrade/downgrade to a specific version.
-                                    CHANNEL and TAG defaults to "stable" and
-                                    "latest" respectively if omitted; See
-                                    "UPDATE" for details. Supported channels:
-                                    stable, nightly
+                                    CHANNEL can be a repository as well. CHANNEL
+                                    and TAG default to "stable" and "latest"
+                                    respectively if omitted; See "UPDATE" for
+                                    details. Supported channels: stable, nightly
     -i, --ignore-errors             Ignore download and postprocessing errors.
                                     The download will be considered successful
                                     even if the postprocessing fails
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 8806106d31..9563d784aa 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -939,7 +939,7 @@ def _real_main(argv=None):
             ydl.cache.remove()
 
         try:
-            updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None)
+            updater = Updater(ydl, opts.update_self)
             if opts.update_self and updater.update() and actual_use:
                 if updater.cmd:
                     return updater.restart()
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index dc46ce9984..838d79fcb1 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -323,7 +323,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         help='Print program version and exit')
     general.add_option(
         '-U', '--update',
-        action='store_true', dest='update_self',
+        action='store_const', dest='update_self', const=CHANNEL,
         help=format_field(
             is_non_updateable(), None, 'Check if updates are available. %s',
             default=f'Update this program to the latest {CHANNEL} version'))
@@ -335,9 +335,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         '--update-to', action='store', dest='update_self', metavar='[CHANNEL]@[TAG]',
         help=(
-            'Upgrade/downgrade to a specific version. CHANNEL and TAG defaults to '
-            f'"{CHANNEL}" and "latest" respectively if omitted; See "UPDATE" for details. '
-            f'Supported channels: {", ".join(UPDATE_SOURCES)}'))
+            'Upgrade/downgrade to a specific version. CHANNEL can be a repository as well. '
+            f'CHANNEL and TAG default to "{CHANNEL.partition("@")[0]}" and "latest" respectively if omitted; '
+            f'See "UPDATE" for details. 
Supported channels: {", ".join(UPDATE_SOURCES)}')) general.add_option( '-i', '--ignore-errors', action='store_true', dest='ignoreerrors', diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 7914de832f..6c9bdaf1c7 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -129,27 +129,36 @@ def __init__(self, ydl, target=None): self.ydl = ydl self.target_channel, sep, self.target_tag = (target or CHANNEL).rpartition('@') - if not sep and self.target_tag in UPDATE_SOURCES: # stable => stable@latest - self.target_channel, self.target_tag = self.target_tag, None + # stable => stable@latest + if not sep and ('/' in self.target_tag or self.target_tag in UPDATE_SOURCES): + self.target_channel = self.target_tag + self.target_tag = None elif not self.target_channel: - self.target_channel = CHANNEL + self.target_channel = CHANNEL.partition('@')[0] if not self.target_tag: - self.target_tag, self._exact = 'latest', False + self.target_tag = 'latest' + self._exact = False elif self.target_tag != 'latest': self.target_tag = f'tags/{self.target_tag}' - @property - def _target_repo(self): - try: - return UPDATE_SOURCES[self.target_channel] - except KeyError: - return self._report_error( - f'Invalid update channel {self.target_channel!r} requested. ' - f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) + if '/' in self.target_channel: + self._target_repo = self.target_channel + if self.target_channel not in (CHANNEL, *UPDATE_SOURCES.values()): + self.ydl.report_warning( + f'You are switching to an {self.ydl._format_err("unofficial", "red")} executable ' + f'from {self.ydl._format_err(self._target_repo, self.ydl.Styles.EMPHASIS)}. ' + f'Run {self.ydl._format_err("at your own risk", "light red")}') + self.restart = self._blocked_restart + else: + self._target_repo = UPDATE_SOURCES.get(self.target_channel) + if not self._target_repo: + self._report_error( + f'Invalid update channel {self.target_channel!r} requested. ' + f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) def _version_compare(self, a, b, channel=CHANNEL): - if channel != self.target_channel: + if self._exact and channel != self.target_channel: return False if _VERSION_RE.fullmatch(f'{a}.{b}'): @@ -372,6 +381,12 @@ def restart(self): _, _, returncode = Popen.run(self.cmd) return returncode + def _blocked_restart(self): + self._report_error( + 'Automatically restarting into custom builds is disabled for security reasons. 
' + 'Restart yt-dlp to use the updated version', expected=True) + return self.ydl._download_retcode + def run_update(ydl): """Update the program file with the latest version from the repository From 44a79958f0b596ee71e1eb25f158610aada29d1b Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 3 Apr 2023 07:06:27 +0200 Subject: [PATCH 11/75] [build] Fix macOS target Authored by: Grub4K --- .github/workflows/build.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa11c61941..bec0576d1e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -188,21 +188,23 @@ jobs: steps: - uses: actions/checkout@v3 - # NB: In order to create a universal2 application, the version of python3 in /usr/bin has to be used + # NB: Building universal2 does not work with python from actions/setup-python - name: Install Requirements run: | brew install coreutils - /usr/bin/python3 -m pip install -U --user pip Pyinstaller==5.8 -r requirements.txt + python3 -m pip install -U --user pip setuptools wheel + # We need to ignore wheels otherwise we break universal2 builds + python3 -m pip install -U --user --no-binary :all: Pyinstaller -r requirements.txt - name: Prepare run: | - /usr/bin/python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} - /usr/bin/python3 devscripts/make_lazy_extractors.py + python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python3 devscripts/make_lazy_extractors.py - name: Build run: | - /usr/bin/python3 pyinst.py --target-architecture universal2 --onedir + python3 pyinst.py --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) - /usr/bin/python3 pyinst.py --target-architecture universal2 + python3 pyinst.py --target-architecture universal2 - name: Upload artifacts uses: actions/upload-artifact@v3 @@ -232,7 +234,8 @@ jobs: - name: Install Requirements run: | brew install coreutils - python3 -m pip install -U --user pip Pyinstaller -r requirements.txt + python3 -m pip install -U --user pip setuptools wheel + python3 -m pip install -U --user Pyinstaller -r requirements.txt - name: Prepare run: | From c4efa0aefec8daef1de62fd1693f13edf3c8b03c Mon Sep 17 00:00:00 2001 From: bashonly Date: Sat, 20 May 2023 11:08:50 -0500 Subject: [PATCH 12/75] [build] Various build workflow improvements - Wait for build before publishing to PyPI - Do not run `meta_files` job if release is cancelled - Customizable channel in release workflow - Display badges above changelog Authored by: bashonly, Grub4K --- .github/workflows/build.yml | 4 +- .github/workflows/publish.yml | 46 +++++++++++------ .github/workflows/release-nightly.yml | 3 +- .github/workflows/release.yml | 72 ++++++++++++++++++++------- devscripts/update-version.py | 2 +- 5 files changed, 90 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bec0576d1e..d038e693d9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,7 +41,7 @@ on: required: true type: string channel: - description: Update channel (stable/nightly) + description: Update channel (stable/nightly/...) 
required: true default: stable type: string @@ -316,7 +316,7 @@ jobs: dist/yt-dlp_x86.exe meta_files: - if: inputs.meta_files && always() + if: inputs.meta_files && always() && !cancelled() needs: - unix - linux_arm diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8a1bd9a010..3ca5c69924 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,16 +2,20 @@ name: Publish on: workflow_call: inputs: - nightly: - default: false - required: false - type: boolean + channel: + default: stable + required: true + type: string version: required: true type: string target_commitish: required: true type: string + prerelease: + default: false + required: true + type: boolean secrets: ARCHIVE_REPO_TOKEN: required: false @@ -34,6 +38,19 @@ jobs: - name: Generate release notes run: | + printf '%s' \ + '[![Installation](https://img.shields.io/badge/-Which%20file%20should%20I%20download%3F-white.svg?style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp#installation "Installation instructions") ' \ + '[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]' \ + '(https://github.com/yt-dlp/yt-dlp/tree/2023.03.04#readme "Documentation") ' \ + '[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators "Donate") ' \ + '[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]' \ + '(https://discord.gg/H5MNcFW63r "Discord") ' \ + ${{ inputs.channel != 'nightly' && '"[![Nightly](https://img.shields.io/badge/Get%20nightly%20builds-purple.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-nightly-builds/releases/latest \"Nightly builds\")"' || '' }} \ + > ./RELEASE_NOTES + printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF #### A description of the various files are in the [README](https://github.com/yt-dlp/yt-dlp#release-files) --- @@ -41,9 +58,9 @@ jobs: $(python ./devscripts/make_changelog.py -vv) EOF - echo "**This is an automated nightly pre-release build**" >> ./PRERELEASE_NOTES - cat ./RELEASE_NOTES >> ./PRERELEASE_NOTES - echo "Generated from: https://github.com/${{ github.repository }}/commit/${{ inputs.target_commitish }}" >> ./ARCHIVE_NOTES + printf '%s\n\n' '**This is an automated nightly pre-release build**' >> ./NIGHTLY_NOTES + cat ./RELEASE_NOTES >> ./NIGHTLY_NOTES + printf '%s\n\n' 'Generated from: https://github.com/${{ github.repository }}/commit/${{ inputs.target_commitish }}' >> ./ARCHIVE_NOTES cat ./RELEASE_NOTES >> ./ARCHIVE_NOTES - name: Archive nightly release @@ -51,7 +68,7 @@ jobs: GH_TOKEN: ${{ secrets.ARCHIVE_REPO_TOKEN }} GH_REPO: ${{ vars.ARCHIVE_REPO }} if: | - inputs.nightly && env.GH_TOKEN != '' && env.GH_REPO != '' + inputs.channel == 'nightly' && env.GH_TOKEN != '' && env.GH_REPO != '' run: | gh release create \ --notes-file ARCHIVE_NOTES \ @@ -60,7 +77,7 @@ jobs: artifact/* - name: Prune old nightly release - if: inputs.nightly && !vars.ARCHIVE_REPO + if: inputs.channel == 'nightly' && !vars.ARCHIVE_REPO env: GH_TOKEN: ${{ github.token }} run: | @@ -68,14 +85,15 @@ jobs: git tag --delete "nightly" || true sleep 5 # Enough time to cover deletion race condition - - name: Publish release${{ inputs.nightly && ' (nightly)' || '' }} + - name: Publish release${{ inputs.channel == 'nightly' && ' (nightly)' || '' }} env: 
GH_TOKEN: ${{ github.token }} - if: (inputs.nightly && !vars.ARCHIVE_REPO) || !inputs.nightly + if: (inputs.channel == 'nightly' && !vars.ARCHIVE_REPO) || inputs.channel != 'nightly' run: | gh release create \ - --notes-file ${{ inputs.nightly && 'PRE' || '' }}RELEASE_NOTES \ + --notes-file ${{ inputs.channel == 'nightly' && 'NIGHTLY_NOTES' || 'RELEASE_NOTES' }} \ --target ${{ inputs.target_commitish }} \ - --title "yt-dlp ${{ inputs.nightly && 'nightly ' || '' }}${{ inputs.version }}" \ - ${{ inputs.nightly && '--prerelease "nightly"' || inputs.version }} \ + --title "yt-dlp ${{ inputs.channel == 'nightly' && 'nightly ' || '' }}${{ inputs.version }}" \ + ${{ inputs.prerelease && '--prerelease' || '' }} \ + ${{ inputs.channel == 'nightly' && '"nightly"' || inputs.version }} \ artifact/* diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index d4f01ab649..543e2e6f78 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -46,6 +46,7 @@ jobs: permissions: contents: write with: - nightly: true + channel: nightly + prerelease: true version: ${{ needs.prepare.outputs.version }} target_commitish: ${{ github.sha }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e07fc0c077..ada508be82 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,22 @@ name: Release -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + version: + description: Version tag (YYYY.MM.DD[.REV]) + required: false + default: '' + type: string + channel: + description: Update channel (stable/nightly/...) + required: false + default: '' + type: string + prerelease: + description: Pre-release + default: false + type: boolean + permissions: contents: read @@ -9,8 +26,9 @@ jobs: contents: write runs-on: ubuntu-latest outputs: + channel: ${{ steps.set_channel.outputs.channel }} version: ${{ steps.update_version.outputs.version }} - head_sha: ${{ steps.push_release.outputs.head_sha }} + head_sha: ${{ steps.get_target.outputs.head_sha }} steps: - uses: actions/checkout@v3 @@ -21,10 +39,18 @@ jobs: with: python-version: "3.10" + - name: Set channel + id: set_channel + run: | + CHANNEL="${{ github.repository == 'yt-dlp/yt-dlp' && 'stable' || github.repository }}" + echo "channel=${{ inputs.channel || '$CHANNEL' }}" > "$GITHUB_OUTPUT" + - name: Update version id: update_version run: | - python devscripts/update-version.py ${{ vars.PUSH_VERSION_COMMIT == '' && '"$(date -u +"%H%M%S")"' || '' }} | \ + REVISION="${{ vars.PUSH_VERSION_COMMIT == '' && '$(date -u +"%H%M%S")' || '' }}" + REVISION="${{ inputs.prerelease && '$(date -u +"%H%M%S")' || '$REVISION' }}" + python devscripts/update-version.py ${{ inputs.version || '$REVISION' }} | \ grep -Po "version=\d+\.\d+\.\d+(\.\d+)?" 
>> "$GITHUB_OUTPUT" - name: Update documentation @@ -39,6 +65,7 @@ jobs: - name: Push to release id: push_release + if: ${{ !inputs.prerelease }} run: | git config --global user.name github-actions git config --global user.email github-actions@example.com @@ -46,14 +73,30 @@ jobs: git commit -m "Release ${{ steps.update_version.outputs.version }}" \ -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" git push origin --force ${{ github.event.ref }}:release + + - name: Get target commitish + id: get_target + run: | echo "head_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - name: Update master - if: vars.PUSH_VERSION_COMMIT != '' + if: vars.PUSH_VERSION_COMMIT != '' && !inputs.prerelease run: git push origin ${{ github.event.ref }} - publish_pypi_homebrew: + build: needs: prepare + uses: ./.github/workflows/build.yml + with: + version: ${{ needs.prepare.outputs.version }} + channel: ${{ needs.prepare.outputs.channel }} + permissions: + contents: read + packages: write # For package cache + secrets: + GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} + + publish_pypi_homebrew: + needs: [prepare, build] runs-on: ubuntu-latest steps: @@ -77,7 +120,7 @@ jobs: env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - if: env.TWINE_PASSWORD != '' + if: env.TWINE_PASSWORD != '' && !inputs.prerelease run: | rm -rf dist/* make pypi-files @@ -89,7 +132,7 @@ jobs: env: BREW_TOKEN: ${{ secrets.BREW_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' + if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease uses: actions/checkout@v3 with: repository: yt-dlp/homebrew-taps @@ -100,7 +143,7 @@ jobs: env: BREW_TOKEN: ${{ secrets.BREW_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' + if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease run: | python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.prepare.outputs.version }}" git -C taps/ config user.name github-actions @@ -108,22 +151,13 @@ jobs: git -C taps/ commit -am 'yt-dlp: ${{ needs.prepare.outputs.version }}' git -C taps/ push - build: - needs: prepare - uses: ./.github/workflows/build.yml - with: - version: ${{ needs.prepare.outputs.version }} - permissions: - contents: read - packages: write # For package cache - secrets: - GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} - publish: needs: [prepare, build] uses: ./.github/workflows/publish.yml permissions: contents: write with: + channel: ${{ needs.prepare.outputs.channel }} + prerelease: ${{ inputs.prerelease }} version: ${{ needs.prepare.outputs.version }} target_commitish: ${{ needs.prepare.outputs.head_sha }} diff --git a/devscripts/update-version.py b/devscripts/update-version.py index d888be8814..c873d10a5d 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -51,7 +51,7 @@ def get_git_head(): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Update the version.py file') parser.add_argument( - '-c', '--channel', choices=['stable', 'nightly'], default='stable', + '-c', '--channel', default='stable', help='Select update channel (default: %(default)s)') parser.add_argument( '-o', '--output', default='yt_dlp/version.py', From b73193c99aa23b135732408a5fcf655c68d731c6 Mon Sep 17 00:00:00 2001 From: bashonly Date: Sat, 20 May 2023 11:12:18 -0500 Subject: [PATCH 13/75] [build] Implement build verification using `--update-to` Authored by: bashonly, Grub4K --- 
.github/workflows/build.yml | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d038e693d9..ac0cfdf7cb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -127,6 +127,19 @@ jobs: mv ./dist/yt-dlp_linux ./yt-dlp_linux mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + binaries=("yt-dlp" "yt-dlp_linux") + for binary in "${binaries[@]}"; do + chmod +x ./${binary} + cp ./${binary} ./${binary}_downgraded + version="$(./${binary} --version)" + ./${binary}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./${binary}_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + done + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -176,6 +189,16 @@ jobs: python3.8 devscripts/make_lazy_extractors.py python3.8 pyinst.py + if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then + arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}" + chmod +x ./dist/yt-dlp_linux_${arch} + cp ./dist/yt-dlp_linux_${arch} ./dist/yt-dlp_linux_${arch}_downgraded + version="$(./dist/yt-dlp_linux_${arch} --version)" + ./dist/yt-dlp_linux_${arch}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_linux_${arch}_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + fi + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -206,6 +229,16 @@ jobs: (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) python3 pyinst.py --target-architecture universal2 + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos + cp ./dist/yt-dlp_macos ./dist/yt-dlp_macos_downgraded + version="$(./dist/yt-dlp_macos --version)" + ./dist/yt-dlp_macos_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -246,6 +279,16 @@ jobs: python3 pyinst.py mv dist/yt-dlp_macos dist/yt-dlp_macos_legacy + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos_legacy + cp ./dist/yt-dlp_macos_legacy ./dist/yt-dlp_macos_legacy_downgraded + version="$(./dist/yt-dlp_macos_legacy --version)" + ./dist/yt-dlp_macos_legacy_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_legacy_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -278,6 +321,19 @@ jobs: python pyinst.py --onedir Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp","yt-dlp_min")) { + Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -309,6 +365,19 @@ jobs: run: | python pyinst.py + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp_x86")) { + Copy-Item 
"./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } + - name: Upload artifacts uses: actions/upload-artifact@v3 with: From 23c39a4beadee382060bb47fdaa21316ca707d38 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 3 Apr 2023 07:22:11 +0200 Subject: [PATCH 14/75] [devscripts] `make_changelog`: Various improvements - Make single items collapse into one line - Don't hide "Important changes" in `
<details>`
- Move upstream merge into priority
- Properly support comma separated prefixes

Authored by: Grub4K
---
 .github/workflows/publish.yml |   4 +-
 devscripts/make_changelog.py  | 187 +++++++++++++++++++---------------
 2 files changed, 106 insertions(+), 85 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3ca5c69924..9ebf54e7fc 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -54,9 +54,7 @@ jobs:
           cat >> ./RELEASE_NOTES << EOF
           #### A description of the various files are in the [README](https://github.com/yt-dlp/yt-dlp#release-files)
           ---
-          <details><summary><h3>Changelog</h3></summary>
-
-          $(python ./devscripts/make_changelog.py -vv)
-
-          </details>
+ $(python ./devscripts/make_changelog.py -vv --collapsible) EOF printf '%s\n\n' '**This is an automated nightly pre-release build**' >> ./NIGHTLY_NOTES cat ./RELEASE_NOTES >> ./NIGHTLY_NOTES diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index b159bc1b9b..1b7e251ee9 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -26,7 +26,6 @@ class CommitGroup(enum.Enum): - UPSTREAM = None PRIORITY = 'Important' CORE = 'Core' EXTRACTOR = 'Extractor' @@ -34,6 +33,11 @@ class CommitGroup(enum.Enum): POSTPROCESSOR = 'Postprocessor' MISC = 'Misc.' + @classmethod + @property + def ignorable_prefixes(cls): + return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream') + @classmethod @lru_cache def commit_lookup(cls): @@ -41,7 +45,6 @@ def commit_lookup(cls): name: group for group, names in { cls.PRIORITY: {''}, - cls.UPSTREAM: {'upstream'}, cls.CORE: { 'aes', 'cache', @@ -54,6 +57,7 @@ def commit_lookup(cls): 'outtmpl', 'plugins', 'update', + 'upstream', 'utils', }, cls.MISC: { @@ -111,22 +115,36 @@ def key(self): return ((self.details or '').lower(), self.sub_details, self.message) +def unique(items): + return sorted({item.strip().lower(): item for item in items if item}.values()) + + class Changelog: MISC_RE = re.compile(r'(?:^|\b)(?:lint(?:ing)?|misc|format(?:ting)?|fixes)(?:\b|$)', re.IGNORECASE) + ALWAYS_SHOWN = (CommitGroup.PRIORITY,) - def __init__(self, groups, repo): + def __init__(self, groups, repo, collapsible=False): self._groups = groups self._repo = repo + self._collapsible = collapsible def __str__(self): return '\n'.join(self._format_groups(self._groups)).replace('\t', ' ') def _format_groups(self, groups): + first = True for item in CommitGroup: + if self._collapsible and item not in self.ALWAYS_SHOWN and first: + first = False + yield '\n
<details><summary><h3>Changelog</h3></summary>\n'
+
             group = groups[item]
             if group:
                 yield self.format_module(item.value, group)
 
+        if self._collapsible:
+            yield '\n</details>
' + def format_module(self, name, group): result = f'\n#### {name} changes\n' if name else '\n' return result + '\n'.join(self._format_group(group)) @@ -137,62 +155,52 @@ def _format_group(self, group): for _, items in detail_groups: items = list(items) details = items[0].details - if not details: - indent = '' - else: - yield f'- {details}' - indent = '\t' if details == 'cleanup': - items, cleanup_misc_items = self._filter_cleanup_misc_items(items) + items = self._prepare_cleanup_misc_items(items) + + prefix = '-' + if details: + if len(items) == 1: + prefix = f'- **{details}**:' + else: + yield f'- **{details}**' + prefix = '\t-' sub_detail_groups = itertools.groupby(items, lambda item: tuple(map(str.lower, item.sub_details))) for sub_details, entries in sub_detail_groups: if not sub_details: for entry in entries: - yield f'{indent}- {self.format_single_change(entry)}' + yield f'{prefix} {self.format_single_change(entry)}' continue entries = list(entries) - prefix = f'{indent}- {", ".join(entries[0].sub_details)}' + sub_prefix = f'{prefix} {", ".join(entries[0].sub_details)}' if len(entries) == 1: - yield f'{prefix}: {self.format_single_change(entries[0])}' + yield f'{sub_prefix}: {self.format_single_change(entries[0])}' continue - yield prefix + yield sub_prefix for entry in entries: - yield f'{indent}\t- {self.format_single_change(entry)}' + yield f'\t{prefix} {self.format_single_change(entry)}' - if details == 'cleanup' and cleanup_misc_items: - yield from self._format_cleanup_misc_sub_group(cleanup_misc_items) - - def _filter_cleanup_misc_items(self, items): + def _prepare_cleanup_misc_items(self, items): cleanup_misc_items = defaultdict(list) - non_misc_items = [] + sorted_items = [] for item in items: if self.MISC_RE.search(item.message): cleanup_misc_items[tuple(item.commit.authors)].append(item) else: - non_misc_items.append(item) + sorted_items.append(item) - return non_misc_items, cleanup_misc_items + for commit_infos in cleanup_misc_items.values(): + sorted_items.append(CommitInfo( + 'cleanup', ('Miscellaneous',), ', '.join( + self._format_message_link(None, info.commit.hash) + for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), + [], Commit(None, '', commit_infos[0].commit.authors), [])) - def _format_cleanup_misc_sub_group(self, group): - prefix = '\t- Miscellaneous' - if len(group) == 1: - yield f'{prefix}: {next(self._format_cleanup_misc_items(group))}' - return - - yield prefix - for message in self._format_cleanup_misc_items(group): - yield f'\t\t- {message}' - - def _format_cleanup_misc_items(self, group): - for authors, infos in group.items(): - message = ', '.join( - self._format_message_link(None, info.commit.hash) - for info in sorted(infos, key=lambda item: item.commit.hash or '')) - yield f'{message} by {self._format_authors(authors)}' + return sorted_items def format_single_change(self, info): message = self._format_message_link(info.message, info.commit.hash) @@ -236,12 +244,8 @@ class CommitRange: AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE) MESSAGE_RE = re.compile(r''' - (?:\[ - (?P[^\]\/:,]+) - (?:/(?P
<details>[^\]:,]+))?
-            (?:[:,](?P<sub_details>[^\]]+))?
-        \]\ )?
-        (?:(?P<sub_details_alt>`?[^:`]+`?): )?
+        (?:\[(?P<prefix>[^\]]+)\]\ )?
+        (?:(?P<sub_details>`?[^:`]+`?): )?
         (?P<message>.+?)
         (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))?
         ''', re.VERBOSE | re.DOTALL)
@@ -340,60 +344,76 @@ def apply_overrides(self, overrides):
         self._commits = {key: value for key, value in reversed(self._commits.items())}
 
     def groups(self):
-        groups = defaultdict(list)
+        group_dict = defaultdict(list)
         for commit in self:
-            upstream_re = self.UPSTREAM_MERGE_RE.match(commit.short)
+            upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short)
             if upstream_re:
-                commit.short = f'[upstream] Merge up to youtube-dl {upstream_re.group(1)}'
+                commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}'
 
             match = self.MESSAGE_RE.fullmatch(commit.short)
             if not match:
                 logger.error(f'Error parsing short commit message: {commit.short!r}')
                 continue
 
-            prefix, details, sub_details, sub_details_alt, message, issues = match.groups()
-            group = None
-            if prefix:
-                if prefix == 'priority':
-                    prefix, _, details = (details or '').partition('/')
-                    logger.debug(f'Priority: {message!r}')
-                    group = CommitGroup.PRIORITY
-
-                if not details and prefix:
-                    if prefix not in ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream'):
-                        logger.debug(f'Replaced details with {prefix!r}')
-                        details = prefix or None
-
-                if details == 'common':
-                    details = None
-
-                if details:
-                    details = details.strip()
-
-            else:
-                group = CommitGroup.CORE
-
-            sub_details = f'{sub_details or ""},{sub_details_alt or ""}'.replace(':', ',')
-            sub_details = tuple(filter(None, map(str.strip, sub_details.split(','))))
-
+            prefix, sub_details_alt, message, issues = match.groups()
             issues = [issue.strip()[1:] for issue in issues.split(',')] if issues else []
 
+            if prefix:
+                groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(',')))
+                group = next(iter(filter(None, groups)), None)
+                details = ', '.join(unique(details))
+                sub_details = list(itertools.chain.from_iterable(sub_details))
+            else:
+                group = CommitGroup.CORE
+                details = None
+                sub_details = []
+
+            if sub_details_alt:
+                sub_details.append(sub_details_alt)
+            sub_details = tuple(unique(sub_details))
+
             if not group:
-                group = CommitGroup.get(prefix.lower())
-                if not group:
-                    if self.EXTRACTOR_INDICATOR_RE.search(commit.short):
-                        group = CommitGroup.EXTRACTOR
-                    else:
-                        group = CommitGroup.POSTPROCESSOR
-                    logger.warning(f'Failed to map {commit.short!r}, selected {group.name}')
+                if self.EXTRACTOR_INDICATOR_RE.search(commit.short):
+                    group = CommitGroup.EXTRACTOR
+                else:
+                    group = CommitGroup.POSTPROCESSOR
+                logger.warning(f'Failed to map {commit.short!r}, selected {group.name.lower()}')
 
             commit_info = CommitInfo(
                 details, sub_details, message.strip(),
                 issues, commit, self._fixes[commit.hash])
-            logger.debug(f'Resolved {commit.short!r} to {commit_info!r}')
-            groups[group].append(commit_info)
 
-        return groups
+            logger.debug(f'Resolved {commit.short!r} to {commit_info!r}')
+            group_dict[group].append(commit_info)
+
+        return group_dict
+
+    @staticmethod
+    def details_from_prefix(prefix):
+        if not prefix:
+            return CommitGroup.CORE, None, ()
+
+        prefix, _, details = prefix.partition('/')
+        prefix = prefix.strip().lower()
+        details = details.strip()
+
+        group = CommitGroup.get(prefix)
+        if group is CommitGroup.PRIORITY:
+            prefix, _, details = details.partition('/')
+
+        if not details and prefix and prefix not in CommitGroup.ignorable_prefixes:
+            logger.debug(f'Replaced details with {prefix!r}')
+            details = prefix or None
+
+        if details == 'common':
+            details = 
None + + if details: + details, *sub_details = details.split(':') + else: + sub_details = [] + + return group, details, sub_details def get_new_contributors(contributors_path, commits): @@ -444,6 +464,9 @@ def get_new_contributors(contributors_path, commits): parser.add_argument( '--repo', default='yt-dlp/yt-dlp', help='the github repository to use for the operations (default: %(default)s)') + parser.add_argument( + '--collapsible', action='store_true', + help='make changelog collapsible (default: %(default)s)') args = parser.parse_args() logging.basicConfig( @@ -467,4 +490,4 @@ def get_new_contributors(contributors_path, commits): write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') - print(Changelog(commits.groups(), args.repo)) + print(Changelog(commits.groups(), args.repo, args.collapsible)) From 69bec6730ec9d724bcedeab199d9d684d61423ba Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 21 May 2023 09:56:23 +1200 Subject: [PATCH 15/75] [cleanup, utils] Split into submodules (#7090) Closes https://github.com/yt-dlp/yt-dlp/pull/2173 Authored by: pukkandan, coletdjnz Co-authored-by: pukkandan --- Makefile | 2 +- setup.cfg | 1 + yt_dlp/YoutubeDL.py | 2 - yt_dlp/utils/__init__.py | 14 + yt_dlp/utils/_deprecated.py | 30 ++ yt_dlp/utils/_legacy.py | 163 ++++++++++ yt_dlp/{utils.py => utils/_utils.py} | 458 +-------------------------- yt_dlp/utils/traversal.py | 254 +++++++++++++++ 8 files changed, 480 insertions(+), 444 deletions(-) create mode 100644 yt_dlp/utils/__init__.py create mode 100644 yt_dlp/utils/_deprecated.py create mode 100644 yt_dlp/utils/_legacy.py rename yt_dlp/{utils.py => utils/_utils.py} (92%) create mode 100644 yt_dlp/utils/traversal.py diff --git a/Makefile b/Makefile index d5d47629b9..f03fe20523 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ offlinetest: codetest $(PYTHON) -m pytest -k "not download" # XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/dependencies +CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/utils yt_dlp/dependencies yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip for d in $(CODE_FOLDERS) ; do \ diff --git a/setup.cfg b/setup.cfg index 6deaa79715..68d9e516d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,7 @@ ignore = E402,E501,E731,E741,W503 max_line_length = 120 per_file_ignores = devscripts/lazy_load_template.py: F401 + yt_dlp/utils/__init__.py: F401, F403 [autoflake] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 91aec1fe6e..b8f1a05a09 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -124,7 +124,6 @@ parse_filesize, preferredencoding, prepend_extension, - register_socks_protocols, remove_terminal_sequences, render_table, replace_extension, @@ -739,7 +738,6 @@ def check_deprecated(param, option, suggestion): when=when) self._setup_opener() - register_socks_protocols() def preload_download_archive(fn): """Preload the archive, if any is specified""" diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py new file mode 100644 index 0000000000..74b39e2c7b --- /dev/null +++ b/yt_dlp/utils/__init__.py @@ -0,0 +1,14 @@ +import warnings + +from ..compat.compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + 
DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +del passthrough_module + +# isort: off +from .traversal import * +from ._utils import * +from ._utils import _configuration_args, _get_exe_version_output +from ._deprecated import * diff --git a/yt_dlp/utils/_deprecated.py b/yt_dlp/utils/_deprecated.py new file mode 100644 index 0000000000..4454d84a72 --- /dev/null +++ b/yt_dlp/utils/_deprecated.py @@ -0,0 +1,30 @@ +"""Deprecated - New code should avoid these""" + +from ._utils import preferredencoding + + +def encodeFilename(s, for_subprocess=False): + assert isinstance(s, str) + return s + + +def decodeFilename(b, for_subprocess=False): + return b + + +def decodeArgument(b): + return b + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, str) + return optval + + +def error_to_compat_str(err): + return str(err) diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py new file mode 100644 index 0000000000..cd009b504c --- /dev/null +++ b/yt_dlp/utils/_legacy.py @@ -0,0 +1,163 @@ +"""No longer used and new code should not use. Exists only for API compat.""" + +import platform +import struct +import sys +import urllib.parse +import zlib + +from ._utils import decode_base_n, preferredencoding +from .traversal import traverse_obj +from ..dependencies import certifi, websockets + +has_certifi = bool(certifi) +has_websockets = bool(websockets) + + +def load_plugins(name, suffix, namespace): + from ..plugins import load_plugins + ret = load_plugins(name, suffix) + namespace.update(ret) + return ret + + +def traverse_dict(dictn, keys, casesense=True): + return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) + + +def decode_base(value, digits): + return decode_base_n(value, table=digits) + + +def platform_name(): + """ Returns the platform name as a str """ + return platform.platform() + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + +# UNUSED +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise OSError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise OSError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + 
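        # Annotation: `idx` is a flat byte offset into the decoded RGB data.
        # Each scanline holds `stride = width * 3` bytes, so the offset splits
        # into a column (x) and a row (y) of the `pixels` rows built below.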
y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels + + +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in urllib.parse.uses_netloc: + urllib.parse.uses_netloc.append(scheme) diff --git a/yt_dlp/utils.py b/yt_dlp/utils/_utils.py similarity index 92% rename from yt_dlp/utils.py rename to yt_dlp/utils/_utils.py index 190af1b7d7..f032af9014 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils/_utils.py @@ -47,26 +47,18 @@ import xml.etree.ElementTree import zlib -from .compat import functools # isort: split -from .compat import ( +from . import traversal + +from ..compat import functools # isort: split +from ..compat import ( compat_etree_fromstring, compat_expanduser, compat_HTMLParseError, compat_os_name, compat_shlex_quote, ) -from .dependencies import brotli, certifi, websockets, xattr -from .socks import ProxyType, sockssocket - - -def register_socks_protocols(): - # "Register" SOCKS protocols - # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 - # URLs with protocols not in urlparse.uses_netloc are not handled correctly - for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in urllib.parse.uses_netloc: - urllib.parse.uses_netloc.append(scheme) - +from ..dependencies import brotli, certifi, websockets, xattr +from ..socks import ProxyType, sockssocket # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -928,27 +920,6 @@ def run(cls, *args, timeout=None, **kwargs): return stdout or default, stderr or default, proc.returncode -def get_subprocess_encoding(): - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return encoding - - -def encodeFilename(s, for_subprocess=False): - assert isinstance(s, str) - return s - - -def decodeFilename(b, for_subprocess=False): - return b - - def encodeArgument(s): # Legacy code that uses byte strings # Uncomment the following line after fixing all post processors @@ -956,20 +927,6 @@ def encodeArgument(s): return s if isinstance(s, str) else s.decode('ascii') -def decodeArgument(b): - return b - - -def decodeOption(optval): - if optval is None: - 
return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, str) - return optval - - _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds')) @@ -1034,7 +991,7 @@ def make_HTTPS_handler(params, **kwargs): context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE if opts_check_certificate: - if has_certifi and 'no-certifi' not in params.get('compat_opts', []): + if certifi and 'no-certifi' not in params.get('compat_opts', []): context.load_verify_locations(cafile=certifi.where()) else: try: @@ -1068,7 +1025,7 @@ def make_HTTPS_handler(params, **kwargs): def bug_reports_message(before=';'): - from .update import REPOSITORY + from ..update import REPOSITORY msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , ' 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U') @@ -2019,12 +1976,6 @@ def __eq__(self, other): and self.start == other.start and self.end == other.end) -def platform_name(): - """ Returns the platform name as a str """ - deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead') - return platform.platform() - - @functools.cache def system_identifier(): python_implementation = platform.python_implementation() @@ -2076,7 +2027,7 @@ def write_string(s, out=None, encoding=None): def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): - from . import _IN_CLI + from .. import _IN_CLI if _IN_CLI: if msg in deprecation_warning._cache: return @@ -3284,13 +3235,6 @@ def variadic(x, allowed_types=NO_DEFAULT): return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) -def dict_get(d, key_or_keys, default=None, skip_false_values=True): - for val in map(d.get, variadic(key_or_keys)): - if val is not None and (val or not skip_false_values): - return val - return default - - def try_call(*funcs, expected_type=None, args=[], kwargs={}): for f in funcs: try: @@ -3528,7 +3472,7 @@ def is_outdated_version(version, limit, assume_new=True): def ytdl_is_updateable(): """ Returns if yt-dlp can be updated with -U """ - from .update import is_non_updateable + from ..update import is_non_updateable return not is_non_updateable() @@ -3538,10 +3482,6 @@ def args_to_str(args): return ' '.join(compat_shlex_quote(a) for a in args) -def error_to_compat_str(err): - return str(err) - - def error_to_str(err): return f'{type(err).__name__}: {err}' @@ -3628,7 +3568,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): mimetype = mt.partition(';')[0].strip().lower() _, _, subtype = mimetype.rpartition('/') - ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) if ext: return ext elif default is not NO_DEFAULT: @@ -3660,7 +3600,7 @@ def parse_codecs(codecs_str): vcodec = full_codec if parts[0] in ('dvh1', 'dvhe'): hdr = 'DV' - elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10': + elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10': hdr = 'HDR10' elif parts[:2] == ['vp9', '2']: hdr = 'HDR10' @@ -3706,8 +3646,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): }, } - sanitize_codec = functools.partial( - try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower()) + sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) vcodec, acodec = sanitize_codec(vcodecs), 
sanitize_codec(acodecs) for ext in preferences or COMPATIBLE_CODECS.keys(): @@ -5088,12 +5027,6 @@ def decode_base_n(string, n=None, table=None): return result -def decode_base(value, digits): - deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed ' - f'in a future version. Use {__name__}.decode_base_n instead') - return decode_base_n(value, table=digits) - - def decode_packed_codes(code): mobj = re.search(PACKED_CODES_RE, code) obfuscated_code, base, count, symbols = mobj.groups() @@ -5138,113 +5071,6 @@ def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n -# Based on png2str() written by @gdkchan and improved by @yokrysty -# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 -def decode_png(png_data): - # Reference: https://www.w3.org/TR/PNG/ - header = png_data[8:] - - if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise OSError('Not a valid PNG file.') - - int_map = {1: '>B', 2: '>H', 4: '>I'} - unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] - - chunks = [] - - while header: - length = unpack_integer(header[:4]) - header = header[4:] - - chunk_type = header[:4] - header = header[4:] - - chunk_data = header[:length] - header = header[length:] - - header = header[4:] # Skip CRC - - chunks.append({ - 'type': chunk_type, - 'length': length, - 'data': chunk_data - }) - - ihdr = chunks[0]['data'] - - width = unpack_integer(ihdr[:4]) - height = unpack_integer(ihdr[4:8]) - - idat = b'' - - for chunk in chunks: - if chunk['type'] == b'IDAT': - idat += chunk['data'] - - if not idat: - raise OSError('Unable to read PNG data.') - - decompressed_data = bytearray(zlib.decompress(idat)) - - stride = width * 3 - pixels = [] - - def _get_pixel(idx): - x = idx % stride - y = idx // stride - return pixels[y][x] - - for y in range(height): - basePos = y * (1 + stride) - filter_type = decompressed_data[basePos] - - current_row = [] - - pixels.append(current_row) - - for x in range(stride): - color = decompressed_data[1 + basePos + x] - basex = y * stride + x - left = 0 - up = 0 - - if x > 2: - left = _get_pixel(basex - 3) - if y > 0: - up = _get_pixel(basex - stride) - - if filter_type == 1: # Sub - color = (color + left) & 0xff - elif filter_type == 2: # Up - color = (color + up) & 0xff - elif filter_type == 3: # Average - color = (color + ((left + up) >> 1)) & 0xff - elif filter_type == 4: # Paeth - a = left - b = up - c = 0 - - if x > 2 and y > 0: - c = _get_pixel(basex - stride - 3) - - p = a + b - c - - pa = abs(p - a) - pb = abs(p - b) - pc = abs(p - c) - - if pa <= pb and pa <= pc: - color = (color + a) & 0xff - elif pb <= pc: - color = (color + b) & 0xff - else: - color = (color + c) & 0xff - - current_row.append(color) - - return width, height, pixels - - def write_xattr(path, key, value): # Windows: Write xattrs to NTFS Alternate Data Streams: # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 @@ -5403,7 +5229,7 @@ def to_high_limit_path(path): def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): - val = traverse_obj(obj, *variadic(field)) + val = traversal.traverse_obj(obj, *variadic(field)) if not val if ignore is NO_DEFAULT else val in variadic(ignore): return default return template % func(val) @@ -5441,12 +5267,12 @@ def make_dir(path, to_screen=None): return True except OSError as err: if callable(to_screen) is not None: - to_screen('unable to create directory ' + error_to_compat_str(err)) + to_screen(f'unable to 
create directory {err}') return False def get_executable_path(): - from .update import _get_variant_and_executable_path + from ..update import _get_variant_and_executable_path return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) @@ -5470,244 +5296,6 @@ def get_system_config_dirs(package_name): yield os.path.join('/etc', package_name) -def traverse_obj( - obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, - casesense=True, is_user_input=False, traverse_string=False): - """ - Safely traverse nested `dict`s and `Iterable`s - - >>> obj = [{}, {"key": "value"}] - >>> traverse_obj(obj, (1, "key")) - "value" - - Each of the provided `paths` is tested and the first producing a valid result will be returned. - The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. - Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. - - The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. - - The keys in the path can be one of: - - `None`: Return the current object. - - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{func}`. If a `type`, returns only values - of this type. If a function, returns `func(obj)`. - - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - - `slice`: Branch out and return all values in `obj[key]`. - - `Ellipsis`: Branch out and return a list of all values. - - `tuple`/`list`: Branch out and return a list of all matching values. - Read as: `[traverse_obj(obj, branch) for branch in branches]`. - - `function`: Branch out and return values filtered by the function. - Read as: `[value for key, value in obj if function(key, value)]`. - For `Iterable`s, `key` is the index of the value. - For `re.Match`es, `key` is the group number (0 = full match) - as well as additionally any group names, if given. - - `dict` Transform the current object and return a matching dict. - Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - - `tuple`, `list`, and `dict` all support nested paths and branches. - - @params paths Paths which to traverse by. - @param default Value to return if the paths do not match. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, depth first. Try to avoid if using nested `dict` keys. - @param expected_type If a `type`, only accept final values of this type. - If any other callable, try to call the function on each result. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, recursively. This does respect branching paths. - @param get_all If `False`, return the first matching result, otherwise all matching ones. - @param casesense If `False`, consider string dictionary keys as case insensitive. - - The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API - - @param is_user_input Whether the keys are generated from user input. - If `True` strings get converted to `int`/`slice` if needed. - @param traverse_string Whether to traverse into objects as strings. - If `True`, any non-compatible object will first be - converted into a string and then traversed into. - The return value of that path will be a string instead, - not respecting any further branching. - - - @returns The result of the object traversal. 
- If successful, `get_all=True`, and the path branches at least once, - then a list of results is returned instead. - If no `default` is given and the last path branches, a `list` of results - is always returned. If a path ends on a `dict` that result will always be a `dict`. - """ - casefold = lambda k: k.casefold() if isinstance(k, str) else k - - if isinstance(expected_type, type): - type_test = lambda val: val if isinstance(val, expected_type) else None - else: - type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) - - def apply_key(key, obj, is_last): - branching = False - result = None - - if obj is None and traverse_string: - if key is ... or callable(key) or isinstance(key, slice): - branching = True - result = () - - elif key is None: - result = obj - - elif isinstance(key, set): - assert len(key) == 1, 'Set should only be used to wrap a single item' - item = next(iter(key)) - if isinstance(item, type): - if isinstance(obj, item): - result = obj - else: - result = try_call(item, args=(obj,)) - - elif isinstance(key, (list, tuple)): - branching = True - result = itertools.chain.from_iterable( - apply_path(obj, branch, is_last)[0] for branch in key) - - elif key is ...: - branching = True - if isinstance(obj, collections.abc.Mapping): - result = obj.values() - elif is_iterable_like(obj): - result = obj - elif isinstance(obj, re.Match): - result = obj.groups() - elif traverse_string: - branching = False - result = str(obj) - else: - result = () - - elif callable(key): - branching = True - if isinstance(obj, collections.abc.Mapping): - iter_obj = obj.items() - elif is_iterable_like(obj): - iter_obj = enumerate(obj) - elif isinstance(obj, re.Match): - iter_obj = itertools.chain( - enumerate((obj.group(), *obj.groups())), - obj.groupdict().items()) - elif traverse_string: - branching = False - iter_obj = enumerate(str(obj)) - else: - iter_obj = () - - result = (v for k, v in iter_obj if try_call(key, args=(k, v))) - if not branching: # string traversal - result = ''.join(result) - - elif isinstance(key, dict): - iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) - result = { - k: v if v is not None else default for k, v in iter_obj - if v is not None or default is not NO_DEFAULT - } or None - - elif isinstance(obj, collections.abc.Mapping): - result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else - next((v for k, v in obj.items() if casefold(k) == key), None)) - - elif isinstance(obj, re.Match): - if isinstance(key, int) or casesense: - with contextlib.suppress(IndexError): - result = obj.group(key) - - elif isinstance(key, str): - result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) - - elif isinstance(key, (int, slice)): - if is_iterable_like(obj, collections.abc.Sequence): - branching = isinstance(key, slice) - with contextlib.suppress(IndexError): - result = obj[key] - elif traverse_string: - with contextlib.suppress(IndexError): - result = str(obj)[key] - - return branching, result if branching else (result,) - - def lazy_last(iterable): - iterator = iter(iterable) - prev = next(iterator, NO_DEFAULT) - if prev is NO_DEFAULT: - return - - for item in iterator: - yield False, prev - prev = item - - yield True, prev - - def apply_path(start_obj, path, test_type): - objs = (start_obj,) - has_branched = False - - key = None - for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): - if is_user_input and isinstance(key, str): - if key == ':': - key = 
... - elif ':' in key: - key = slice(*map(int_or_none, key.split(':'))) - elif int_or_none(key) is not None: - key = int(key) - - if not casesense and isinstance(key, str): - key = key.casefold() - - if __debug__ and callable(key): - # Verify function signature - inspect.signature(key).bind(None, None) - - new_objs = [] - for obj in objs: - branching, results = apply_key(key, obj, last) - has_branched |= branching - new_objs.append(results) - - objs = itertools.chain.from_iterable(new_objs) - - if test_type and not isinstance(key, (dict, list, tuple)): - objs = map(type_test, objs) - - return objs, has_branched, isinstance(key, dict) - - def _traverse_obj(obj, path, allow_empty, test_type): - results, has_branched, is_dict = apply_path(obj, path, test_type) - results = LazyList(item for item in results if item not in (None, {})) - if get_all and has_branched: - if results: - return results.exhaust() - if allow_empty: - return [] if default is NO_DEFAULT else default - return None - - return results[0] if results else {} if allow_empty and is_dict else None - - for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result - - return None if default is NO_DEFAULT else default - - -def traverse_dict(dictn, keys, casesense=True): - deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed ' - f'in a future version. Use "{__name__}.traverse_obj" instead') - return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) - - -def get_first(obj, *paths, **kwargs): - return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) - - def time_seconds(**kwargs): """ Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z) @@ -5803,7 +5391,7 @@ def number_of_digits(number): def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: - values = (traverse_obj(from_dict, variadic(v)) for v in values) + values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(str, filter(None, values))) @@ -6514,15 +6102,3 @@ def calculate_preference(self, format): format['abr'] = format.get('tbr') - format.get('vbr', 0) return tuple(self._calculate_field_preference(format, field) for field in self._order) - - -# Deprecated -has_certifi = bool(certifi) -has_websockets = bool(websockets) - - -def load_plugins(name, suffix, namespace): - from .plugins import load_plugins - ret = load_plugins(name, suffix) - namespace.update(ret) - return ret diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py new file mode 100644 index 0000000000..462c3ba5df --- /dev/null +++ b/yt_dlp/utils/traversal.py @@ -0,0 +1,254 @@ +import collections.abc +import contextlib +import inspect +import itertools +import re + +from ._utils import ( + IDENTITY, + NO_DEFAULT, + LazyList, + int_or_none, + is_iterable_like, + try_call, + variadic, +) + + +def traverse_obj( + obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, + casesense=True, is_user_input=False, traverse_string=False): + """ + Safely traverse nested `dict`s and `Iterable`s + + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" + + Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. 
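    # A few further illustrative examples of the path syntax documented here
    # (hypothetical data; the `{int_or_none}` path shows the `set` key form):
    #
    #   >>> traverse_obj({'a': [{'id': 1}, {'id': 2}]}, ('a', ..., 'id'))
    #   [1, 2]
    #   >>> traverse_obj({'n': '5'}, ('n', {int_or_none}))
    #   5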
+ Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. + + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. + + The keys in the path can be one of: + - `None`: Return the current object. + - `set`: Requires the only item in the set to be a type or function, + like `{type}`/`{func}`. If a `type`, returns only values + of this type. If a function, returns `func(obj)`. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Iterable`s, `key` is the index of the value. + For `re.Match`es, `key` is the group number (0 = full match) + as well as additionally any group names, if given. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + @param default Value to return if the paths do not match. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, depth first. Try to avoid if using nested `dict` keys. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, recursively. This does respect branching paths. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + The return value of that path will be a string instead, + not respecting any further branching. + + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + If no `default` is given and the last path branches, a `list` of results + is always returned. If a path ends on a `dict` that result will always be a `dict`. + """ + casefold = lambda k: k.casefold() if isinstance(k, str) else k + + if isinstance(expected_type, type): + type_test = lambda val: val if isinstance(val, expected_type) else None + else: + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def apply_key(key, obj, is_last): + branching = False + result = None + + if obj is None and traverse_string: + if key is ... 
or callable(key) or isinstance(key, slice): + branching = True + result = () + + elif key is None: + result = obj + + elif isinstance(key, set): + assert len(key) == 1, 'Set should only be used to wrap a single item' + item = next(iter(key)) + if isinstance(item, type): + if isinstance(obj, item): + result = obj + else: + result = try_call(item, args=(obj,)) + + elif isinstance(key, (list, tuple)): + branching = True + result = itertools.chain.from_iterable( + apply_path(obj, branch, is_last)[0] for branch in key) + + elif key is ...: + branching = True + if isinstance(obj, collections.abc.Mapping): + result = obj.values() + elif is_iterable_like(obj): + result = obj + elif isinstance(obj, re.Match): + result = obj.groups() + elif traverse_string: + branching = False + result = str(obj) + else: + result = () + + elif callable(key): + branching = True + if isinstance(obj, collections.abc.Mapping): + iter_obj = obj.items() + elif is_iterable_like(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, re.Match): + iter_obj = itertools.chain( + enumerate((obj.group(), *obj.groups())), + obj.groupdict().items()) + elif traverse_string: + branching = False + iter_obj = enumerate(str(obj)) + else: + iter_obj = () + + result = (v for k, v in iter_obj if try_call(key, args=(k, v))) + if not branching: # string traversal + result = ''.join(result) + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) + result = { + k: v if v is not None else default for k, v in iter_obj + if v is not None or default is not NO_DEFAULT + } or None + + elif isinstance(obj, collections.abc.Mapping): + result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else + next((v for k, v in obj.items() if casefold(k) == key), None)) + + elif isinstance(obj, re.Match): + if isinstance(key, int) or casesense: + with contextlib.suppress(IndexError): + result = obj.group(key) + + elif isinstance(key, str): + result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + + elif isinstance(key, (int, slice)): + if is_iterable_like(obj, collections.abc.Sequence): + branching = isinstance(key, slice) + with contextlib.suppress(IndexError): + result = obj[key] + elif traverse_string: + with contextlib.suppress(IndexError): + result = str(obj)[key] + + return branching, result if branching else (result,) + + def lazy_last(iterable): + iterator = iter(iterable) + prev = next(iterator, NO_DEFAULT) + if prev is NO_DEFAULT: + return + + for item in iterator: + yield False, prev + prev = item + + yield True, prev + + def apply_path(start_obj, path, test_type): + objs = (start_obj,) + has_branched = False + + key = None + for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): + if is_user_input and isinstance(key, str): + if key == ':': + key = ... 
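                # These conversions handle keys generated from user input
                # (e.g. output templates): a bare ':' becomes `...` (branch
                # over all values), 'a:b' becomes a slice, and numeric text
                # becomes an integer index.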
+ elif ':' in key: + key = slice(*map(int_or_none, key.split(':'))) + elif int_or_none(key) is not None: + key = int(key) + + if not casesense and isinstance(key, str): + key = key.casefold() + + if __debug__ and callable(key): + # Verify function signature + inspect.signature(key).bind(None, None) + + new_objs = [] + for obj in objs: + branching, results = apply_key(key, obj, last) + has_branched |= branching + new_objs.append(results) + + objs = itertools.chain.from_iterable(new_objs) + + if test_type and not isinstance(key, (dict, list, tuple)): + objs = map(type_test, objs) + + return objs, has_branched, isinstance(key, dict) + + def _traverse_obj(obj, path, allow_empty, test_type): + results, has_branched, is_dict = apply_path(obj, path, test_type) + results = LazyList(item for item in results if item not in (None, {})) + if get_all and has_branched: + if results: + return results.exhaust() + if allow_empty: + return [] if default is NO_DEFAULT else default + return None + + return results[0] if results else {} if allow_empty and is_dict else None + + for index, path in enumerate(paths, 1): + result = _traverse_obj(obj, path, index == len(paths), True) + if result is not None: + return result + + return None if default is NO_DEFAULT else default + + +def get_first(obj, *paths, **kwargs): + return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) + + +def dict_get(d, key_or_keys, default=None, skip_false_values=True): + for val in map(d.get, variadic(key_or_keys)): + if val is not None and (val or not skip_false_values): + return val + return default From 955c89584b66fcd0fcfab3e611f1edeb1ca63886 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 21 May 2023 10:55:09 +1200 Subject: [PATCH 16/75] [core] Deprecate internal `Youtubedl-no-compression` header (#6876) Authored by: coletdjnz --- yt_dlp/YoutubeDL.py | 4 +++- yt_dlp/downloader/external.py | 4 +--- yt_dlp/downloader/http.py | 4 ++-- yt_dlp/extractor/litv.py | 2 +- yt_dlp/utils/_legacy.py | 10 ++++++++++ yt_dlp/utils/_utils.py | 23 ++++++----------------- 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index b8f1a05a09..1162d2df1a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2380,7 +2380,9 @@ def restore_last_token(self): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - + if 'Youtubedl-No-Compression' in res: # deprecated + res.pop('Youtubedl-No-Compression', None) + res['Accept-Encoding'] = 'identity' cookies = self._calc_cookies(info_dict['url']) if cookies: res['Cookie'] = cookies diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ee130c8270..007689a8c9 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -23,7 +23,6 @@ encodeArgument, encodeFilename, find_available_port, - handle_youtubedl_headers, remove_end, sanitized_Request, traverse_obj, @@ -529,10 +528,9 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): - headers_dict = handle_youtubedl_headers(fmt['http_headers']) # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
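            # Note: the `handle_youtubedl_headers` call is dropped here; the
            # deprecated `Youtubedl-no-compression` header is now rewritten to
            # `Accept-Encoding: identity` in `YoutubeDL._calc_headers` instead,
            # so `fmt['http_headers']` can be passed to ffmpeg unfiltered.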
- args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in headers_dict.items())]) + args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) if start_time: args += ['-ss', str(start_time)] diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index fa72d5722a..79f69b5d02 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -45,8 +45,8 @@ class DownloadContext(dict): ctx.tmpfilename = self.temp_name(filename) ctx.stream = None - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} + # Disable compression + headers = {'Accept-Encoding': 'identity'} add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 31826ac99e..0b792fb96f 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -113,7 +113,7 @@ def _real_extract(self, url): entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' title = program_info['title'] + program_info.get('secondaryMark', '') description = program_info.get('description') diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index cd009b504c..b0578a1d6b 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -161,3 +161,13 @@ def register_socks_protocols(): for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): if scheme not in urllib.parse.uses_netloc: urllib.parse.uses_netloc.append(scheme) + + +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index f032af9014..9f1a127cdb 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1308,25 +1308,12 @@ def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_a return hc -def handle_youtubedl_headers(headers): - filtered_headers = headers - - if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} - del filtered_headers['Youtubedl-no-compression'] - - return filtered_headers - - class YoutubeDLHandler(urllib.request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-no-compression", which will be - removed before making the real request. + the standard headers to every HTTP request and handles gzipped, deflated and + brotli responses from web servers. 
Part of this code was copied from: @@ -1389,11 +1376,13 @@ def http_request(self, req): if h.capitalize() not in req.headers: req.add_header(h, v) + if 'Youtubedl-no-compression' in req.headers: # deprecated + req.headers.pop('Youtubedl-no-compression', None) + req.add_header('Accept-encoding', 'identity') + if 'Accept-encoding' not in req.headers: req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS)) - req.headers = handle_youtubedl_headers(req.headers) - return super().do_request_(req) def http_response(self, req, resp): From 69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2 Mon Sep 17 00:00:00 2001 From: kangalio Date: Mon, 22 May 2023 13:47:06 +0200 Subject: [PATCH 17/75] [extractor/youtube:music:search_url] Extract title (#7102) Authored by: kangalio Closes #7095 --- yt_dlp/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d089822f64..bd38900f2c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4579,8 +4579,11 @@ def _grid_entries(self, grid_renderer): def _music_reponsive_list_entry(self, renderer): video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) if video_id: + title = traverse_obj(renderer, ( + 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer', + 'text', 'runs', 0, 'text')) return self.url_result(f'https://music.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id) + ie=YoutubeIE.ie_key(), video_id=video_id, title=title) playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) if playlist_id: video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) From 46f1370e9af6f8af8762f67e27e5acb8f0c48a47 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 24 May 2023 23:29:30 +0530 Subject: [PATCH 18/75] [devscripts/cli_to_api] Add script --- devscripts/cli_to_api.py | 48 +++++++++++++++++++++++++++++++++++ yt_dlp/YoutubeDL.py | 8 +++--- yt_dlp/downloader/common.py | 7 ++--- yt_dlp/downloader/fragment.py | 4 +-- yt_dlp/utils/_utils.py | 6 +++-- 5 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 devscripts/cli_to_api.py diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py new file mode 100644 index 0000000000..b8b7cbcf1d --- /dev/null +++ b/devscripts/cli_to_api.py @@ -0,0 +1,48 @@ +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import yt_dlp +import yt_dlp.options + +create_parser = yt_dlp.options.create_parser + + +def parse_patched_options(opts): + patched_parser = create_parser() + patched_parser.defaults.update({ + 'ignoreerrors': False, + 'retries': 0, + 'fragment_retries': 0, + 'extract_flat': False, + 'concat_playlist': 'never', + }) + yt_dlp.options.__dict__['create_parser'] = lambda: patched_parser + try: + return yt_dlp.parse_options(opts) + finally: + yt_dlp.options.__dict__['create_parser'] = create_parser + + +default_opts = parse_patched_options([]).ydl_opts + + +def cli_to_api(opts, cli_defaults=False): + opts = (yt_dlp.parse_options if cli_defaults else parse_patched_options)(opts).ydl_opts + + diff = {k: v for k, v in opts.items() if default_opts[k] != v} + if 'postprocessors' in diff: + diff['postprocessors'] = [pp for pp in diff['postprocessors'] + if pp not in default_opts['postprocessors']] + return diff + + +if __name__ == '__main__': + from pprint import pprint + + print('\nThe arguments passed translate 
to:\n') + pprint(cli_to_api(sys.argv[1:])) + print('\nCombining these with the CLI defaults gives:\n') + pprint(cli_to_api(sys.argv[1:], True)) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 1162d2df1a..cd82b27727 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -280,7 +280,7 @@ class YoutubeDL: subtitles. The language can be prefixed with a "-" to exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. + daterange: A utils.DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. False to disable filesystem cache. @@ -329,13 +329,13 @@ class YoutubeDL: 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Whether to resolve and process url_results further - * False: Always process (default) + * False: Always process. Default for API * True: Never process * 'in_playlist': Do not process inside playlist/multi_video * 'discard': Always process, but don't return the result from inside playlist/multi_video * 'discard_in_playlist': Same as "discard", but only for - playlists (not multi_video) + playlists (not multi_video). Default for CLI wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -472,7 +472,7 @@ class YoutubeDL: can also be used The following options are used by the extractors: - extractor_retries: Number of times to retry for known errors + extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) hls_split_discontinuity: Split HLS playlists to different formats at discontinuities such as ad breaks (default: False) diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 077b29b41f..8f9bc05d6e 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -51,8 +51,9 @@ class FileDownloader: ratelimit: Download speed limit, in bytes/sec. continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) - retries: Number of times to retry for HTTP error 5xx - file_access_retries: Number of times to retry on file access error + retries: Number of times to retry for expected network errors. + Default is 0 for API, but 10 for CLI + file_access_retries: Number of times to retry on file access error (default: 3) buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. 
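This CLI/API split in the defaults means embedders do not get the CLI's retry
behaviour for free; it is exactly the kind of difference the new
devscripts/cli_to_api.py above makes visible. A minimal sketch of opting back
into the CLI-like defaults from the API (the URL is only a placeholder):

    import yt_dlp

    ydl_opts = {
        'retries': 10,           # CLI default; the API default is 0
        'fragment_retries': 10,  # likewise 0 via the API, 10 via the CLI
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://example.com/some-video'])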
@@ -225,7 +226,7 @@ def error_callback(err, count, retries, *, fd): sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) def wrapper(self, func, *args, **kwargs): - for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self): try: return func(self, *args, **kwargs) except OSError as err: diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 3dc638f523..8abf7760ba 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -34,8 +34,8 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) + fragment_retries: Number of times to retry a fragment for HTTP error + (DASH and hlsnative only). Default is 0 for API, but 10 for CLI skip_unavailable_fragments: Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 9f1a127cdb..afcb2a1642 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -60,6 +60,8 @@ from ..dependencies import brotli, certifi, websockets, xattr from ..socks import ProxyType, sockssocket +__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -1957,8 +1959,8 @@ def __contains__(self, date): date = date_from_str(date) return self.start <= date <= self.end - def __str__(self): - return f'{self.start.isoformat()} - {self.end.isoformat()}' + def __repr__(self): + return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})' def __eq__(self, other): return (isinstance(other, DateRange) From 4823ec9f461512daa1b8ab362893bb86a6320b26 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 24 May 2023 23:30:43 +0530 Subject: [PATCH 19/75] Update to ytdl-commit-d1c6c5 [YouTube] [core] Improve platform debug log, based on yt-dlp https://github.com/ytdl-org/youtube-dl/commit/d1c6c5c4d618fa950813c0c71aede34a5ac851e9 Except: * 6ed34338285f722d0da312ce0af3a15a077a3e2a [jsinterp] Add short-cut evaluation for common expression * There was no performance improvement when tested with https://github.com/ytdl-org/youtube-dl/issues/30641 * e8de54bce50f6f77a4d7e8e80675f7003d5bf630 [core] Handle `/../` sequences in HTTP URLs * We plan to implement this differently --- test/test_jsinterp.py | 32 ++++++++++++++++++++++++++------ test/test_utils.py | 32 ++++++++++++++++++++++++++++++++ yt_dlp/downloader/common.py | 24 +++++++++++++++++------- yt_dlp/downloader/fragment.py | 33 ++++++++++++++++++++------------- yt_dlp/downloader/http.py | 3 ++- yt_dlp/extractor/aenetworks.py | 15 +++++++++++++-- yt_dlp/extractor/litv.py | 2 +- yt_dlp/extractor/youtube.py | 10 +++------- yt_dlp/jsinterp.py | 2 +- yt_dlp/utils/_utils.py | 12 ++++++++++-- 10 files changed, 125 insertions(+), 40 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 444909b84b..96274116b9 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -66,9 +66,8 @@ def test_assignments(self): self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) + @unittest.skip('Not implemented') def test_comments(self): - 'Skipping: Not yet fully implemented' - return self._test(''' 
function f() { var x = /* 1 + */ 2; @@ -100,10 +99,13 @@ def test_builtins(self): jsi = JSInterpreter('function f() { return NaN }') self.assertTrue(math.isnan(jsi.call_function('f'))) - self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', - 86000) - self._test('function f(dt) { return new Date(dt) - 0; }', - 86000, args=['Wednesday 31 December 1969 18:01:26 MDT']) + def test_date(self): + self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', 86000) + + jsi = JSInterpreter('function f(dt) { return new Date(dt) - 0; }') + self.assertEqual(jsi.call_function('f', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + self.assertEqual(jsi.call_function('f', '12/31/1969 18:01:26 MDT'), 86000) # m/d/y + self.assertEqual(jsi.call_function('f', '1 January 1970 00:00:00 UTC'), 0) def test_call(self): jsi = JSInterpreter(''' @@ -286,6 +288,19 @@ def test_regex(self): jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') + @unittest.skip('Not implemented') + def test_replace(self): + self._test('function f() { let a="data-name".replace("data-", ""); return a }', + 'name') + self._test('function f() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; }', + 'name') + self._test('function f() { let a="data-name".replace(/^.+-/, ""); return a; }', + 'name') + self._test('function f() { let a="data-name".replace(/a/g, "o"); return a; }', + 'doto-nome') + self._test('function f() { let a="data-name".replaceAll("a", "o"); return a; }', + 'doto-nome') + def test_char_code_at(self): jsi = JSInterpreter('function f(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('f', 0), 116) @@ -311,6 +326,11 @@ def test_negative(self): self._test('function f(){return 2 - + + - -2;}', 0) self._test('function f(){return 2 + - + - -2;}', 0) + @unittest.skip('Not implemented') + def test_packed(self): + jsi = JSInterpreter('''function f(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') + self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 
9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index e1bf6ac20f..a22f25d730 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,6 +5,7 @@ import re import sys import unittest +import warnings sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -112,6 +113,7 @@ subtitles_filename, timeconvert, traverse_obj, + try_call, unescapeHTML, unified_strdate, unified_timestamp, @@ -123,6 
+125,7 @@ urlencode_postdata, urljoin, urshift, + variadic, version_tuple, xpath_attr, xpath_element, @@ -1974,6 +1977,35 @@ def test_get_compatible_ext(self): self.assertEqual(get_compatible_ext( vcodecs=['av1'], acodecs=['mp4a'], vexts=['webm'], aexts=['m4a'], preferences=('webm', 'mkv')), 'mkv') + def test_try_call(self): + def total(*x, **kwargs): + return sum(x) + sum(kwargs.values()) + + self.assertEqual(try_call(None), None, + msg='not a fn should give None') + self.assertEqual(try_call(lambda: 1), 1, + msg='int fn with no expected_type should give int') + self.assertEqual(try_call(lambda: 1, expected_type=int), 1, + msg='int fn with expected_type int should give int') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with wrong expected_type should give None') + self.assertEqual(try_call(total, args=(0, 1, 0, ), expected_type=int), 1, + msg='fn should accept arglist') + self.assertEqual(try_call(total, kwargs={'a': 0, 'b': 1, 'c': 0}, expected_type=int), 1, + msg='fn should accept kwargs') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with no expected_type should give None') + self.assertEqual(try_call(lambda x: {}, total, args=(42, ), expected_type=int), 42, + msg='expect first int result with expected_type int') + + def test_variadic(self): + self.assertEqual(variadic(None), (None, )) + self.assertEqual(variadic('spam'), ('spam', )) + self.assertEqual(variadic('spam', allowed_types=dict), 'spam') + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') + def test_traverse_obj(self): _TEST_DATA = { 100: 100, diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 8f9bc05d6e..c48a2ff8ac 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -139,17 +139,21 @@ def calc_percent(byte_counter, data_len): def format_percent(percent): return ' N/A%' if percent is None else f'{percent:>5.1f}%' - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT): + if total is NO_DEFAULT: + rate, remaining = start_or_rate, now_or_remaining + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + + start, now = start_or_rate, now_or_remaining if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def calc_speed(start, now, bytes): @@ -166,6 +170,12 @@ def format_speed(speed): def format_retries(retries): return 'inf' if retries == float('inf') else int(retries) + @staticmethod + def filesize_or_none(unencoded_filename): + if os.path.isfile(unencoded_filename): + return os.path.getsize(unencoded_filename) + return 0 + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 8abf7760ba..6770815abb 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -121,6 +121,11 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_dat 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } + frag_resume_len = 0 + if 
ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False @@ -155,9 +160,7 @@ def _append_fragment(self, ctx, frag_content): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -173,12 +176,11 @@ def _prepare_frag_download(self, ctx): }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -187,7 +189,9 @@ def _prepare_frag_download(self, ctx): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + continuedl = self.params.get('continuedl', True) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -201,7 +205,12 @@ def _prepare_frag_download(self, ctx): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -274,12 +283,10 @@ def frag_progress_hook(s): else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) + if not ctx['live']: + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -297,7 +304,7 @@ def _finish_frag_download(self, ctx, info_dict): to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) + downloaded_bytes = self.filesize_or_none(ctx['filename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 79f69b5d02..e785f0d4ed 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -150,7 +150,8 @@ def establish_connection(): # Content-Range is either not present or invalid. 
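The thread running through these fragment hunks is that resume length is now derived from the on-disk partial file, and only honoured when `--continue` is in effect. A hypothetical condensation of that logic (`resume_state` is not a real method; `filesize_or_none` matches the helper added to `common.py` above):

```python
import os

def filesize_or_none(path):
    # A missing file counts as zero bytes downloaded
    return os.path.getsize(path) if os.path.isfile(path) else 0

def resume_state(tmpfilename, continuedl=True):
    # Probe the partial file only under --continue; append when something
    # is already on disk, otherwise truncate and start over.
    resume_len = filesize_or_none(tmpfilename) if continuedl else 0
    open_mode = 'ab' if resume_len > 0 else 'wb'
    return resume_len, open_mode

assert resume_state('/nonexistent/file.part') == (0, 'wb')
```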
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + elif range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index d7c401016c..f049a0fb3c 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -3,6 +3,8 @@ ExtractorError, GeoRestrictedError, int_or_none, + remove_start, + traverse_obj, update_url_query, urlencode_postdata, ) @@ -72,7 +74,14 @@ def _extract_aetn_info(self, domain, filter_key, filter_value, url): requestor_id, brand = self._DOMAIN_MAP[domain] result = self._download_json( 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + filter_value, query={'filter[%s]' % filter_key: filter_value}) + result = traverse_obj( + result, ('results', + lambda k, v: k == 0 and v[filter_key] == filter_value), + get_all=False) + if not result: + raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, + video_id=remove_start(filter_value, '/')) title = result['title'] video_id = result['id'] media_url = result['publicUrl'] @@ -123,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -140,6 +149,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -303,6 +313,7 @@ def _real_extract(self, url): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 0b792fb96f..19b298ec6c 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -4,8 +4,8 @@ from ..utils import ( ExtractorError, int_or_none, - traverse_obj, smuggle_url, + traverse_obj, unsmuggle_url, ) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index bd38900f2c..654bf5e6b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -66,7 +66,6 @@ variadic, ) - STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { @@ -2994,17 +2993,14 @@ def _parse_sig_js(self, jscode): r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + 
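The A&E change above replaces a blind `['results'][0]` with a guarded lookup. A small demonstration, with made-up data, of how the `traverse_obj` predicate only accepts `results[0]` when it actually matches the requested filter:

```python
from yt_dlp.utils import traverse_obj

api_response = {'results': [{'canonical': '/shows/duck-dynasty', 'id': '123'}]}
filter_key, filter_value = 'canonical', '/shows/duck-dynasty'

result = traverse_obj(
    api_response,
    ('results', lambda k, v: k == 0 and v[filter_key] == filter_value),
    get_all=False)
assert result == {'canonical': '/shows/duck-dynasty', 'id': '123'}

# A non-matching first result now yields None (-> the new ExtractorError)
# instead of silently extracting the wrong item.
assert traverse_obj(
    {'results': [{'canonical': '/shows/other'}]},
    ('results', lambda k, v: k == 0 and v[filter_key] == filter_value),
    get_all=False) is None
```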
r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'("|\')signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -4883,7 +4879,7 @@ def _extract_metadata_from_tabs(self, item_id, data): metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) if metadata_renderer: channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}), - ('channelUrl', {self.ucid_from_url})) + ('channelUrl', {self.ucid_from_url})) info.update({ 'channel': metadata_renderer.get('title'), 'channel_id': channel_id, diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 82974fb27b..1ef1f0823a 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -443,7 +443,7 @@ def dict_item(key, val): err = e pending = (None, False) - m = re.match(r'catch\s*(?P\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + m = re.match(fr'catch\s*(?P\(\s*{_NAME_RE}\s*\))?\{{', expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) if err: diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index afcb2a1642..238b0fe694 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -130,8 +130,13 @@ def random_user_agent(): } -NO_DEFAULT = object() -IDENTITY = lambda x: x +class NO_DEFAULT: + pass + + +def IDENTITY(x): + return x + ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', @@ -3223,6 +3228,9 @@ def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO def variadic(x, allowed_types=NO_DEFAULT): + if not isinstance(allowed_types, (tuple, type)): + deprecation_warning('allowed_types should be a tuple or a type') + allowed_types = tuple(allowed_types) return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) From 15b2d3db1d40b0437fca79d8874d392aa54b3cdd Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 24 May 2023 22:13:24 +0530 Subject: [PATCH 20/75] [misc] Add automatic duplicate issue detection --- .github/workflows/potential-duplicates.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/potential-duplicates.yml diff --git a/.github/workflows/potential-duplicates.yml b/.github/workflows/potential-duplicates.yml new file mode 100644 index 0000000000..1521ae20c0 --- /dev/null +++ b/.github/workflows/potential-duplicates.yml @@ -0,0 +1,20 @@ +name: Potential Duplicates +on: + issues: + types: [opened, edited] + +jobs: + run: + runs-on: ubuntu-latest + steps: + - uses: wow-actions/potential-duplicates@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + label: potential-duplicate + state: all + threshold: 0.7 + comment: | + This issue is potentially a duplicate of one of the 
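The `utils` hunk above turns `NO_DEFAULT` from `object()` into a class precisely so that it satisfies `variadic`'s new `isinstance(allowed_types, (tuple, type))` guard (a class *is* an instance of `type`). A self-contained sketch; the `is_iterable_like` body is paraphrased from the existing helper, and `warnings.warn` stands in for `deprecation_warning`:

```python
import collections.abc
import warnings

class NO_DEFAULT:  # a class is an instance of `type`, so the guard below passes
    pass

def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    return isinstance(x, allowed_types) and not isinstance(x, blocked_types)

def variadic(x, allowed_types=NO_DEFAULT):
    if not isinstance(allowed_types, (tuple, type)):
        warnings.warn('allowed_types should be a tuple or a type')  # deprecated list form
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x,)

assert variadic(None) == (None,)
assert variadic('spam') == ('spam',)                   # str is blocked -> wrapped
assert variadic('spam', allowed_types=dict) == 'spam'  # still iterable, not blocked
```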
following issues: + {{#issues}} + - #{{ number }} ({{ accuracy }}%) + {{/issues}} From 7aeda6cc9e73ada0b0a0b6a6748c66bef63a20a8 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 24 May 2023 23:05:20 +0530 Subject: [PATCH 21/75] [jsinterp] Do not compile regex --- test/test_jsinterp.py | 4 +++- yt_dlp/jsinterp.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 96274116b9..4d44e6efe6 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -8,7 +8,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math -import re from yt_dlp.jsinterp import JS_Undefined, JSInterpreter @@ -275,7 +274,9 @@ def test_object(self): def test_regex(self): self._test('function f() { let a=/,,[/,913,/](,)}/; }', None) + self._test('function f() { let a=/,,[/,913,/](,)}/; return a; }', R'/,,[/,913,/](,)}/0') + R''' # We are not compiling regex jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/; return a; }') self.assertIsInstance(jsi.call_function('f'), re.Pattern) @@ -287,6 +288,7 @@ def test_regex(self): jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') + ''' @unittest.skip('Not implemented') def test_replace(self): diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 1ef1f0823a..7c7940efd5 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -352,8 +352,10 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': flags, outer = self._regex_flags(outer) + # We don't support regex methods yet, so no point compiling it + inner = f'{inner}/{flags}' # Avoid https://github.com/python/cpython/issues/74534 - inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) + # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) else: inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: From 8417f26b8a819cd7ffcd4e000ca3e45033e670fb Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 24 May 2023 20:35:07 +0200 Subject: [PATCH 22/75] [core] Implement `--color` flag (#6904) Authored by: Grub4K --- README.md | 9 +++++++-- yt_dlp/YoutubeDL.py | 36 +++++++++++++++++++++++++++++++----- yt_dlp/__init__.py | 6 +++++- yt_dlp/downloader/common.py | 3 ++- yt_dlp/options.py | 24 +++++++++++++++++++++--- 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d0eaba7477..25ed3b8441 100644 --- a/README.md +++ b/README.md @@ -425,8 +425,12 @@ ## General Options: --no-wait-for-video Do not wait for scheduled streams (default) --mark-watched Mark videos watched (even with --simulate) --no-mark-watched Do not mark videos watched (default) - --no-colors Do not emit color codes in output (Alias: - --no-colours) + --color [STREAM:]POLICY Whether to emit color codes in output, + optionally prefixed by the STREAM (stdout or + stderr) to apply the setting to. Can be one + of "always", "auto" (default), "never", or + "no_color" (use non color terminal + sequences). 
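Patch 21 above stops producing `re.Pattern` objects for JS regex literals and keeps them as `body/flags` strings instead. A simplified model of the new behaviour (the flag table is an assumed subset of what `_regex_flags` maps; the assertion mirrors the updated `test_regex`):

```python
import re

_RE_FLAGS = {'i': re.I, 'm': re.M, 's': re.S}  # assumed subset

def js_regex_literal(body, flag_chars=''):
    # Regex methods aren't interpreted yet, so there is no point compiling;
    # this also sidesteps https://github.com/python/cpython/issues/74534
    # for bodies that begin with '[['.
    flags = 0
    for ch in flag_chars:
        flags |= _RE_FLAGS.get(ch, 0)
    return f'{body}/{flags}'  # numeric flags, as the interpreter now returns them

assert js_regex_literal('/,,[/,913,/](,)}') == R'/,,[/,913,/](,)}/0'
```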
Can be used multiple times --compat-options OPTS Options that can help keep compatibility with youtube-dl or youtube-dlc configurations by reverting some of the @@ -2148,6 +2152,7 @@ #### Redundant options --playlist-end NUMBER -I :NUMBER --playlist-reverse -I ::-1 --no-playlist-reverse Default + --no-colors --color no_color #### Not recommended diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index cd82b27727..e1e5588363 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -415,7 +415,12 @@ class YoutubeDL: - Raise utils.DownloadCancelled(msg) to abort remaining downloads when a video is rejected. match_filter_func in utils.py is one example for this. - no_color: Do not emit color codes in output. + color: A Dictionary with output stream names as keys + and their respective color policy as values. + Can also just be a single color policy, + in which case it applies to all outputs. + Valid stream names are 'stdout' and 'stderr'. + Valid color policies are one of 'always', 'auto', 'no_color' or 'never'. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header geo_bypass_country: @@ -537,6 +542,7 @@ class YoutubeDL: data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. (only for youtube) + no_color: Same as `color='no_color'` """ _NUMERIC_FIELDS = { @@ -603,9 +609,24 @@ def __init__(self, params=None, auto_init=True): except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + if self.params.get('no_color'): + if self.params.get('color') is not None: + self.report_warning('Overwriting params from "color" with "no_color"') + self.params['color'] = 'no_color' + + term_allow_color = os.environ.get('TERM', '').lower() != 'dumb' + + def process_color_policy(stream): + stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream] + policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) + if policy in ('auto', None): + return term_allow_color and supports_terminal_sequences(stream) + assert policy in ('always', 'never', 'no_color') + return {'always': True, 'never': False}.get(policy, policy) + self._allow_colors = Namespace(**{ - type_: not self.params.get('no_color') and supports_terminal_sequences(stream) - for type_, stream in self._out_files.items_ if type_ != 'console' + name: process_color_policy(stream) + for name, stream in self._out_files.items_ if name != 'console' }) # The code is left like this to be reused for future deprecations @@ -974,7 +995,7 @@ def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_enc text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - return format_text(text, f) if allow_colors else text if fallback is None else fallback + return format_text(text, f) if allow_colors is True else text if fallback is None else fallback def _format_out(self, *args, **kwargs): return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) @@ -3769,9 +3790,14 @@ def print_debug_header(self): def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + additional_info = [] + if os.environ.get('TERM', '').lower() == 'dumb': + additional_info.append('dumb') if not supports_terminal_sequences(stream): from .utils import WINDOWS_VT_MODE # Must be imported locally - ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + additional_info.append('No VT' if 
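The `process_color_policy` closure above is the heart of the new `--color` handling. Restated as a free function for clarity (the `params['color']` normalisation is simplified; `supports_tty` stands in for `supports_terminal_sequences(stream)`):

```python
import os
import sys

def process_color_policy(params, stream, supports_tty):
    stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
    color = params.get('color')
    policy = color.get(stream_name) if isinstance(color, dict) else color
    if policy in ('auto', None):
        return os.environ.get('TERM', '').lower() != 'dumb' and supports_tty
    # 'no_color' is deliberately passed through as a string: _format_text()
    # now checks `allow_colors is True`, so 'no_color' suppresses ANSI
    # sequences while remaining distinguishable from a hard 'never'.
    return {'always': True, 'never': False}.get(policy, policy)

assert process_color_policy({'color': {'stderr': 'never'}}, sys.stderr, True) is False
assert process_color_policy({'color': 'no_color'}, sys.stdout, True) == 'no_color'
```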
WINDOWS_VT_MODE is False else 'No ANSI') + if additional_info: + ret = f'{ret} ({",".join(additional_info)})' return ret encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 9563d784aa..137c9503f6 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -436,6 +436,10 @@ def metadataparser_actions(f): elif ed and proto == 'default': default_downloader = ed.get_basename() + for policy in opts.color.values(): + if policy not in ('always', 'auto', 'no_color', 'never'): + raise ValueError(f'"{policy}" is not a valid color policy') + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -894,7 +898,7 @@ def parse_options(argv=None): 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': opts.match_filter, - 'no_color': opts.no_color, + 'color': opts.color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'hls_use_mpegts': opts.hls_use_mpegts, diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index c48a2ff8ac..477ec3c8a0 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -296,7 +296,8 @@ def _prepare_multiline_status(self, lines=1): self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) - self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') + self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color' + self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out def _finish_multiline_status(self): self._multiline.end() diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 838d79fcb1..fecc274031 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -34,6 +34,7 @@ join_nonempty, orderedSet_from_options, remove_end, + variadic, write_string, ) from .version import CHANNEL, __version__ @@ -250,7 +251,7 @@ def _dict_from_options_callback( if multiple_args: val = [val, *value[1:]] elif default_key is not None: - keys, val = [default_key], value + keys, val = variadic(default_key), value else: raise optparse.OptionValueError( f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"') @@ -440,8 +441,25 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not mark videos watched (default)') general.add_option( '--no-colors', '--no-colours', - action='store_true', dest='no_color', default=False, - help='Do not emit color codes in output (Alias: --no-colours)') + action='store_const', dest='color', const={ + 'stdout': 'no_color', + 'stderr': 'no_color', + }, + help=optparse.SUPPRESS_HELP) + general.add_option( + '--color', + dest='color', metavar='[STREAM:]POLICY', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'stdout|stderr', + 'default_key': ['stdout', 'stderr'], + 'process': str.strip, + }, help=( + 'Whether to emit color codes in output, optionally prefixed by ' + 'the STREAM (stdout or stderr) to apply the setting to. ' + 'Can be one of "always", "auto" (default), "never", or ' + '"no_color" (use non color terminal sequences). 
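`--color` reuses `_dict_from_options_callback`, with `default_key` now allowed to be a list (hence the new `variadic` call), so an unprefixed policy fans out to both streams. A toy model of the accumulation — the parsing details are invented, only the CLI semantics are taken from the option definition above:

```python
def parse_color_args(values):
    out = {}
    for value in values:
        keys, _, policy = value.rpartition(':')
        targets = keys.split(',') if keys else ['stdout', 'stderr']  # default_key fan-out
        for key in targets:
            out[key.strip()] = policy.strip()
    return out

# later occurrences override earlier ones per stream, so the option is repeatable
assert parse_color_args(['never']) == {'stdout': 'never', 'stderr': 'never'}
assert parse_color_args(['auto', 'stderr:no_color']) == {'stdout': 'auto', 'stderr': 'no_color'}
```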
' + 'Can be used multiple times')) general.add_option( '--compat-options', metavar='OPTS', dest='compat_opts', default=set(), type='str', From 032de83ea9ff2f4977d9c71a93bbc1775597b762 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 24 May 2023 20:45:15 +0200 Subject: [PATCH 23/75] [extractor/crunchyroll] Rework with support for movies, music and artists (#6237) This adds `CrunchyrollMusicIE` and `CrunchyrollArtistIE` extractors using the new, reworked base class and expands the `CrunchyrollBetaIE` with support for movies and movie listings and more complete metadata extraction Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/crunchyroll.py | 692 +++++++++++++++++++++++--------- 2 files changed, 499 insertions(+), 195 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fd2bfa9a10..8984d4b167 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -406,6 +406,8 @@ from .crunchyroll import ( CrunchyrollBetaIE, CrunchyrollBetaShowIE, + CrunchyrollMusicIE, + CrunchyrollArtistIE, ) from .cspan import CSpanIE, CSpanCongressIE from .ctsnews import CtsNewsIE diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 1abffcd745..d4a21616ba 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,28 +1,37 @@ import base64 -import urllib.parse +import urllib.error from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, format_field, + int_or_none, join_nonempty, + parse_age_limit, + parse_count, parse_iso8601, qualities, + remove_start, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _AUTH_HEADERS = None + _API_ENDPOINT = None + _BASIC_AUTH = None + _QUERY = {} @property def is_logged_in(self): - return self._get_cookies(self._LOGIN_URL).get('etp_rt') + return self._get_cookies(self._BASE_URL).get('etp_rt') def _perform_login(self, username, password): if self.is_logged_in: @@ -35,7 +44,7 @@ def _perform_login(self, username, password): 'device_id': 'whatvalueshouldbeforweb', 'device_type': 'com.crunchyroll.static', 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL + 'referer': f'{self._BASE_URL}/welcome/login' }) if upsell_response['code'] != 'ok': raise ExtractorError('Could not get session id') @@ -43,149 +52,89 @@ def _perform_login(self, username, password): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ + data=urlencode_postdata({ 'account': username, 'password': password, 'session_id': session_id - }).encode('ascii')) + })) if login_response['code'] != 'ok': raise ExtractorError('Login failed. 
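The Crunchyroll login hunk below swaps a manual `urllib.parse.urlencode(...).encode('ascii')` for `urlencode_postdata`, which is essentially that same one-liner packaged as a helper:

```python
import urllib.parse

def urlencode_postdata(*args, **kwargs):
    return urllib.parse.urlencode(*args, **kwargs).encode('ascii')

assert urlencode_postdata({'account': 'user@example.invalid', 'session_id': 'abc'}) \
    == b'account=user%40example.invalid&session_id=abc'
```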
Server message: %s' % login_response['message'], expected=True) if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config + def _update_query(self, lang): + if lang in CrunchyrollBaseIE._QUERY: + return - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' - else: - grant_type, key = 'client_id', 'anonClientId' + webpage = self._download_webpage( + f'{self._BASE_URL}/{lang}', None, note=f'Retrieving main page (lang={lang or None})') - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') + initial_state = self._search_json(r'__INITIAL_STATE__\s*=', webpage, 'initial state', None) + CrunchyrollBaseIE._QUERY[lang] = traverse_obj(initial_state, { + 'locale': ('localization', 'locale'), + }) or None - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] - } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBaseIE.params + if CrunchyrollBaseIE._BASIC_AUTH: + return + app_config = self._search_json(r'__APP_CONFIG__\s*=', webpage, 'app config', None) + cx_api_param = app_config['cxApiParams']['accountAuthClientId' if self.is_logged_in else 'anonClientId'] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() -class CrunchyrollBetaIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ - (?P(?:\w{2}(?:-\w{2})?/)?) 
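The `_BASIC_AUTH` value built from `cxApiParams` above is ordinary HTTP Basic auth with an empty password, which is what the trailing colon encodes:

```python
import base64

def basic_auth(client_id):
    # 'id:' (empty password) -> base64 -> 'Basic ...'
    return 'Basic ' + base64.b64encode(f'{client_id}:'.encode()).decode()

assert basic_auth('abc') == 'Basic YWJjOg=='
```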
- watch/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', - 'info_dict': { - 'id': 'GY2P1Q98Y', - 'ext': 'mp4', - 'duration': 1380.241, - 'timestamp': 1459632600, - 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'title': 'World Trigger Episode 73 – To the Future', - 'upload_date': '20160402', - 'series': 'World Trigger', - 'series_id': 'GR757DMKY', - 'season': 'World Trigger', - 'season_id': 'GR9P39NJ6', - 'season_number': 1, - 'episode': 'To the Future', - 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:2', - }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, - }, { - 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', - 'info_dict': { - 'id': 'GYE5WKQGR', - 'ext': 'mp4', - 'duration': 366.459, - 'timestamp': 1476788400, - 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', - 'upload_date': '20161018', - 'series': 'SHELTER', - 'series_id': 'GYGG09WWY', - 'season': 'SHELTER', - 'season_id': 'GR09MGK4R', - 'season_number': 1, - 'episode': 'Porter Robinson presents Shelter the Animation', - 'episode_number': 0, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:0', - }, - 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', - }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', - 'only_matching': True, - }, { - 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', - 'only_matching': True, - }] + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): + return - def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + assert CrunchyrollBaseIE._BASIC_AUTH, '_update_query needs to be called at least one time beforehand' + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + auth_response = self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): - if self.is_logged_in: - raise ExtractorError('This video is for premium members only', expected=True) - else: - self.raise_login_required('This video is for premium members only') + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} + CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_query(lang) + self._update_auth() - requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or 
['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' + + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query={**CrunchyrollBaseIE._QUERY[lang], **query}) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, urllib.error.HTTPError) and error.cause.code == 404: + return None + raise + + if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_formats(self, stream_response, display_id=None): requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - available_formats = {} - for stream_type, streams in get_streams('streams'): + for stream_type, streams in traverse_obj( + stream_response, (('streams', ('data', 0)), {dict.items}, ...)): if stream_type not in requested_formats: continue - for stream in streams.values(): - if not stream.get('url'): - continue + for stream in traverse_obj(streams, lambda _, v: v['url']): hardsub_lang = stream.get('hardsub_locale') or '' format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] if '' in available_formats and 'all' not in requested_hardsubs: full_format_langs = set(requested_hardsubs) self.to_screen( @@ -196,6 +145,8 @@ def _real_extract(self, url): else: full_format_langs = set(map(str.lower, available_formats)) + audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + hardsub_preference = qualities(requested_hardsubs[::-1]) formats = [] for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): if stream_type.endswith('hls'): @@ -214,63 +165,292 @@ def _real_extract(self, url): continue for f in adaptive_formats: if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') + f['language'] = audio_locale f['quality'] = hardsub_preference(hardsub_lang.lower()) formats.extend(adaptive_formats) - chapters = None + return formats + + def _extract_subtitles(self, data): + subtitles = {} + + for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): + subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] + + return subtitles + + +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': response['key_pair_id'], + } + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10 + + if not 
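The hardsub selection above leans on `utils.qualities`, which scores an id by its position in a list (later entries rank higher). Reversing the user's request list therefore makes the first requested hardsub language sort best:

```python
from yt_dlp.utils import qualities

requested_hardsubs = ['', 'en-US']  # '' means "no hardsubs"; order is user preference
hardsub_preference = qualities(requested_hardsubs[::-1])

assert hardsub_preference('') == 1        # most preferred
assert hardsub_preference('en-US') == 0
assert hardsub_preference('de-DE') == -1  # unrequested languages sort last
```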
path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) + + +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta\.|www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) + watch/(?!concert|musicvideo)(?P\w+)''' + _TESTS = [{ + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': 'GY2P1Q98Y', + 'ext': 'mp4', + 'duration': 1380.241, + 'timestamp': 1459632600, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + 'series': 'World Trigger', + 'series_id': 'GR757DMKY', + 'season': 'World Trigger', + 'season_id': 'GR9P39NJ6', + 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + }, { + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', + 'info_dict': { + 'id': 'GYE5WKQGR', + 'ext': 'mp4', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 
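`_call_cms_api_signed` (like `_update_auth` earlier) follows one caching pattern: credentials are stored on the class, shared by all extractor instances in a run, and refreshed shortly before the server-side expiry. A schematic version; `fetch_policy` and `expires_epoch` are placeholders for the real `index/v2` call and its `parse_iso8601`-parsed `expires` field:

```python
import time

class SignedApiClient:
    _CMS_QUERY = None
    _CMS_EXPIRY = None  # epoch seconds

    def signed_query(self, fetch_policy):
        cls = type(self)
        if not cls._CMS_EXPIRY or cls._CMS_EXPIRY <= time.time():
            policy = fetch_policy()
            cls._CMS_QUERY = {
                'Policy': policy['policy'],
                'Signature': policy['signature'],
                'Key-Pair-Id': policy['key_pair_id'],
            }
            cls._CMS_EXPIRY = policy['expires_epoch'] - 10  # refresh 10 s early
        return cls._CMS_QUERY
```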
'only_matching': True, + }, { + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', + 'only_matching': True, + }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) + + elif object_type == 'movie': + result = self._transform_movie_response(response) + + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', + CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) + + else: + raise ExtractorError(f'Unknown object type {object_type}') + + # There might be multiple audio languages for one object (`_metadata.versions`), + # so we need to get the id from `streams_link` instead or we dont know which language to choose + streams_link = response.get('streams_link') + if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + # We need go from unsigned to signed api to avoid getting soft banned + stream_response = self._call_cms_api_signed(remove_start( + streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + result['subtitles'] = self._extract_subtitles(stream_response) + # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - display_id, fatal=False, errnote=False) + intro_chapter = self._download_json( + f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) if isinstance(intro_chapter, dict): - chapters = [{ + result['chapters'] = [{ 'title': 'Intro', 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')) + 'end_time': float_or_none(intro_chapter.get('endTime')), }] + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', {calculate_count}), + }))) + + return result + + @staticmethod + def 
_transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') - }, - 'chapters': chapters + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('%s%s' % ( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), + } + + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P(?:\w{2}(?:-\w{2})?/)?) 
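The `_transform_*_response` helpers above rely on `traverse_obj`'s dict templates: each value is a path, `{callable}` leaves transform the matched value, and keys whose path resolves to `None` are dropped. A worked example with data shaped like the episode metadata:

```python
import functools
from yt_dlp.utils import float_or_none, parse_iso8601, traverse_obj

data = {
    'id': 'GY2P1Q98Y',
    'episode_metadata': {'duration_ms': 1380241, 'upload_date': '2016-04-02T21:30:00Z'},
}

info = traverse_obj(data, ('episode_metadata', {
    'duration': ('duration_ms', {functools.partial(float_or_none, scale=1000)}),
    'timestamp': ('upload_date', {parse_iso8601}),
    'series': ('series_title', {str}),  # absent -> key omitted from the result
}))
assert info == {'duration': 1380.241, 'timestamp': 1459632600}
```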
- series/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' + series/(?P\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -279,41 +459,163 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) - - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) - - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number'), - 'language': episode.get('audio_locale'), - } + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) - return self.playlist_result(entries(), internal_id, series_response.get('title')) + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }) + }))) + + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) + 
https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) + watch/(?Pconcert|musicvideo)/(?P\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + }[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + streams_link = response.get('streams_link') + if not streams_link and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + result = self._transform_music_response(response) + stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artist': ('artist', 'name'), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } + + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P(?:\w{2}(?:-\w{2})?/)?) 
+ artist/(?P\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genre': ['J-Pop', 'Anime', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) + + def entries(): + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, (attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + }), + } From edbe5b589dd0860a67b4e03f58db3cd2539d91c2 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 25 May 2023 22:52:44 +0200 Subject: [PATCH 24/75] Bugfixes for 4823ec9f461512daa1b8ab362893bb86a6320b26 Hotfix for fragmented downloads Authored by: bashonly --- yt_dlp/downloader/fragment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 6770815abb..53b4b604e7 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -284,7 +284,7 @@ def frag_progress_hook(s): frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) if not ctx['live']: state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes @@ -304,7 +304,7 @@ def _finish_frag_download(self, ctx, info_dict): to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = self.filesize_or_none(ctx['filename']) + downloaded_bytes = self.filesize_or_none(ctx['tmpfilename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] From 4ad58667c102bd82a7c4cca8aa395ec1682e3b4c Mon Sep 17 00:00:00 2001 From: MMM Date: Thu, 25 May 2023 23:06:58 +0200 Subject: [PATCH 25/75] [extractor/bibeltv] Fix extraction, support live streams and series (#6505) Authored by: flashdagger --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/bibeltv.py | 208 +++++++++++++++++++++++++++++--- 2 files changed, 194 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8984d4b167..6a1406dc5c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -204,7 +204,11 @@ BFMTVLiveIE, BFMTVArticleIE, ) -from .bibeltv import BibelTVIE +from .bibeltv import ( + BibelTVLiveIE, + BibelTVSeriesIE, 
+ BibelTVVideoIE, +) from .bigflix import BigflixIE from .bigo import BigoIE from .bild import BildIE diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index fd20aadad4..34464daa1a 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,27 +1,197 @@ +from functools import partial + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P\d+)' - _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', - 'info_dict': { - 'id': 'ref:329703', - 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), } - }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', - 'only_matching': True, + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single video' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P\d+)[\w-]+' + IE_NAME = 'bibeltv:video' + + _TESTS = [{ + 'url': 
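The BibelTV rewrite uses `format_field` throughout; it is a None-safe `%`-interpolation that returns a default (empty string) rather than rendering `'None'` into a URL:

```python
from yt_dlp.utils import format_field

data = {'id': 12345, 'slug': None}

assert format_field(data, 'id', 'https://www.bibeltv.de/mediathek/api/video/%s') \
    == 'https://www.bibeltv.de/mediathek/api/video/12345'
assert format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s') == ''
```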
+        'md5': 'ec1c07efe54353780512e8a4103b612e',
+        'info_dict': {
+            'id': '344436',
+            'ext': 'mp4',
+            'title': 'Alte Wege',
+            'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9',
+            'timestamp': 1677877071,
+            'duration': 150.0,
+            'upload_date': '20230303',
+            'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg',
+            'episode': 'Episode 1',
+            'episode_number': 1,
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            'format': '6',
+        },
     }]
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s'

     def _real_extract(self, url):
         crn_id = self._match_id(url)
-        return self.url_result(
-            self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew')
+        video_data = traverse_obj(
+            self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id),
+            ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict}))
+        if not video_data:
+            raise ExtractorError('Missing video data.')
+
+        return self._extract_video_info(video_data)
+
+
+class BibelTVSeriesIE(BibelTVBaseIE):
+    IE_DESC = 'BibelTV series playlist'
+    _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P<id>\d+)[\w-]+'
+    IE_NAME = 'bibeltv:series'
+
+    _TESTS = [{
+        'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag',
+        'playlist_mincount': 400,
+        'info_dict': {
+            'id': '333485',
+            'title': 'Ein Wunder für jeden Tag',
+            'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.',
+        },
+    }]
+
+    def _real_extract(self, url):
+        crn_id = self._match_id(url)
+        webpage = self._download_webpage(url, crn_id)
+        nextjs_data = self._search_nextjs_data(webpage, crn_id)
+        series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict}))
+        if not series_data:
+            raise ExtractorError('Missing series data.')
+
+        return self.playlist_result(
+            traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})),
+            crn_id, series_data.get('title'), clean_html(series_data.get('description')))
+
+
+class BibelTVLiveIE(BibelTVBaseIE):
+    IE_DESC = 'BibelTV live program'
+    _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P<id>[\w-]+)'
+    IE_NAME = 'bibeltv:live'
+
+    _TESTS = [{
+        'url': 'https://www.bibeltv.de/livestreams/bibeltv/',
+        'info_dict': {
+            'id': 'bibeltv',
+            'ext': 'mp4',
+            'title': 're:Bibel TV',
+            'live_status': 'is_live',
+            'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.bibeltv.de/livestreams/impuls/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        stream_id = self._match_id(url)
+        webpage = self._download_webpage(url, stream_id)
+        stream_data = self._search_json(
+            r'\\"video\\":', webpage, 'bibeltvData', stream_id,
+            transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"')))
+
+        formats, subtitles = self._extract_formats_and_subtitles(
+            traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True)
+
+        return {
+            'id': stream_id,
+            'title': stream_data.get('title'),
+            'thumbnail': stream_data.get('poster'),
+            'is_live': True,
+            'formats': formats,
+            'subtitles': subtitles,
+        }

From 5caf30dbc34f10b0be60676fece635b5c59f0d72 Mon Sep 17 00:00:00 2001
From: Audrey <45548254+tntmod54321@users.noreply.github.com>
Date: Fri, 26 May 2023 08:24:39 -0400
Subject: [PATCH 26/75] [extractor/youtube] Extract `heatmap` data (#7100)

Closes #3888

Authored by: tntmod54321
---
 yt_dlp/extractor/common.py  |  4 ++++
 yt_dlp/extractor/youtube.py | 15 
+++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 78288f8091..1b1dd560fd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -350,6 +350,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 654bf5e6b6..80edcd77da 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1273,6 +1273,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', } }, { @@ -1426,6 +1427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'FlyingKitty', 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', 'uploader_id': '@FlyingKitty900', + 'comment_count': int, }, }, { @@ -3244,6 +3246,17 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) + def _extract_heatmap_from_player_overlay(self, data): + content_list = traverse_obj(data, ( + 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', + 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) + return next(filter(None, ( + traverse_obj(contents, (..., 'heatMarkerRenderer', { + 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, + 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), + })) for contents in content_list)), None) + def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: @@ -4313,6 +4326,8 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) + info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), expected_type=list, default=[]) From b844a3f8b16500663e7ab6c6ec061cc9b30f71ac Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 26 May 2023 07:57:10 -0500 Subject: [PATCH 27/75] [extractor/weverse] Add extractors (#6711) Closes #4786 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 8 + yt_dlp/extractor/naver.py | 2 +- yt_dlp/extractor/weverse.py | 604 ++++++++++++++++++++++++++++++++ 3 files changed, 613 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/weverse.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6a1406dc5c..49dd9aecd5 100644 --- a/yt_dlp/extractor/_extractors.py +++ 
b/yt_dlp/extractor/_extractors.py @@ -2320,6 +2320,14 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .weverse import ( + WeverseIE, + WeverseMediaIE, + WeverseMomentIE, + WeverseLiveTabIE, + WeverseMediaTabIE, + WeverseLiveIE, +) from .wevidi import WeVidiIE from .whyp import WhypIE from .wikimedia import WikimediaIE diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 7a1890a618..d79caf5f3d 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE + @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py new file mode 100644 index 0000000000..ab629c885c --- /dev/null +++ b/yt_dlp/extractor/weverse.py @@ -0,0 +1,604 @@ +import base64 +import hashlib +import hmac +import itertools +import json +import re +import time +import urllib.error +import urllib.parse +import uuid + +from .common import InfoExtractor +from .naver import NaverBaseIE +from .youtube import YoutubeIE +from ..utils import ( + ExtractorError, + UserNotLive, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, + url_or_none, +) + + +class WeverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'weverse' + _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _API_HEADERS = { + 'Referer': 'https://weverse.io/', + 'WEV-device-Id': str(uuid.uuid4()), + } + + def _perform_login(self, username, password): + headers = { + 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', + 'x-acc-app-version': '2.2.6', + 'x-acc-language': 'en', + 'x-acc-service-id': 'weverse', + 'x-acc-trace-id': str(uuid.uuid4()), + 'x-clog-user-device-id': str(uuid.uuid4()), + } + check_username = self._download_json( + f'{self._ACCOUNT_API_BASE}/signup/email/status', None, + note='Checking username', query={'email': username}, headers=headers) + if not check_username.get('hasPassword'): + raise ExtractorError('Invalid username provided', expected=True) + + headers['content-type'] = 'application/json' + try: + auth = self._download_json( + f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + 'email': username, + 'password': password, + }, separators=(',', ':')).encode(), headers=headers, note='Logging in') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise ExtractorError('Invalid password provided', expected=True) + raise + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {auth["accessToken"]}' + + def _real_initialize(self): + if self._API_HEADERS.get('Authorization'): + return + + token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value) + if not token: + self.raise_login_required() + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' + + def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): + # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js + # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: + key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' + api_path = update_url_query(ep, { + 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', + 'language': 'en', 
+ 'platform': 'WEB', + 'wpf': 'pc', + }) + wmsgpad = int(time.time() * 1000) + wmd = base64.b64encode(hmac.HMAC( + key, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + headers = {'Content-Type': 'application/json'} if data else {} + try: + return self._download_json( + f'https://global.apis.naver.com/weverse/wevweb{api_path}', video_id, note=note, + data=data, headers={**self._API_HEADERS, **headers}, query={ + 'wmsgpad': wmsgpad, + 'wmd': wmd, + }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + self.raise_login_required( + 'Session token has expired. Log in again or refresh cookies in browser') + elif isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + raise ExtractorError('Your account does not have access to this content', expected=True) + raise + + def _call_post_api(self, video_id): + return self._call_api(f'/post/v1.0/post-{video_id}?fieldSet=postV1', video_id) + + def _get_community_id(self, channel): + return str(self._call_api( + f'/community/v1.0/communityIdUrlPathByUrlPathArtistCode?keyword={channel}', + channel, note='Fetching community ID')['communityId']) + + def _get_formats(self, data, video_id): + formats = traverse_obj(data, ('videos', 'list', lambda _, v: url_or_none(v['source']), { + 'url': 'source', + 'width': ('encodingOption', 'width', {int_or_none}), + 'height': ('encodingOption', 'height', {int_or_none}), + 'vcodec': 'type', + 'vbr': ('bitrate', 'video', {int_or_none}), + 'abr': ('bitrate', 'audio', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('encodingOption', 'id', {str_or_none}), + })) + + for stream in traverse_obj(data, ('streams', lambda _, v: v['type'] == 'HLS' and url_or_none(v['source']))): + query = {} + for param in traverse_obj(stream, ('keys', lambda _, v: v['type'] == 'param' and v['name'])): + query[param['name']] = param.get('value', '') + fmts = self._extract_m3u8_formats( + stream['source'], video_id, 'mp4', m3u8_id='hls', fatal=False, query=query) + if query: + for fmt in fmts: + fmt['url'] = update_url_query(fmt['url'], query) + fmt['extra_param_to_segment_url'] = urllib.parse.urlencode(query) + formats.extend(fmts) + + return formats + + def _get_subs(self, caption_url): + subs_ext_re = r'\.(?:ttml|vtt)' + replace_ext = lambda x, y: re.sub(subs_ext_re, y, x) + if re.search(subs_ext_re, caption_url): + return [replace_ext(caption_url, '.ttml'), replace_ext(caption_url, '.vtt')] + return [caption_url] + + def _parse_post_meta(self, metadata): + return traverse_obj(metadata, { + 'title': ((('extension', 'mediaInfo', 'title'), 'title'), {str}), + 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'uploader_id': ('author', 'memberId', {str}), + 'creator': ('community', 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'video', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'release_timestamp': ('extension', 'video', 'onAirStartAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', (('mediaInfo', 'thumbnail', 'url'), ('video', 'thumb')), {url_or_none}), + 'view_count': ('extension', 'video', 'playCount', {int_or_none}), + 'like_count': ('extension', 'video', 'likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False) + + def _extract_availability(self, data): + 
return self._availability(**traverse_obj(data, ((('extension', 'video'), None), {
+            'needs_premium': 'paid',
+            'needs_subscription': 'membershipOnly',
+        }), get_all=False, expected_type=bool), needs_auth=True)
+
+    def _extract_live_status(self, data):
+        data = traverse_obj(data, ('extension', 'video', {dict})) or {}
+        if data.get('type') == 'LIVE':
+            return traverse_obj({
+                'ONAIR': 'is_live',
+                'DONE': 'post_live',
+                'STANDBY': 'is_upcoming',
+                'DELAY': 'is_upcoming',
+            }, (data.get('status'), {str})) or 'is_live'
+        return 'was_live' if data.get('liveToVod') else 'not_live'
+
+
+class WeverseIE(WeverseBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)'
+    _TESTS = [{
+        'url': 'https://weverse.io/billlie/live/0-107323480',
+        'md5': '1fa849f00181eef9100d3c8254c47979',
+        'info_dict': {
+            'id': '0-107323480',
+            'ext': 'mp4',
+            'title': '행복한 평이루💜',
+            'description': '',
+            'uploader': 'Billlie',
+            'uploader_id': '5ae14aed7b7cdc65fa87c41fe06cc936',
+            'channel': 'billlie',
+            'channel_id': '72',
+            'channel_url': 'https://weverse.io/billlie',
+            'creator': 'Billlie',
+            'timestamp': 1666262062,
+            'upload_date': '20221020',
+            'release_timestamp': 1666262058,
+            'release_date': '20221020',
+            'duration': 3102,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://weverse.io/lesserafim/live/2-102331763',
+        'md5': 'e46125c08b13a6c8c1f4565035cca987',
+        'info_dict': {
+            'id': '2-102331763',
+            'ext': 'mp4',
+            'title': '🎂김채원 생신🎂',
+            'description': '🎂김채원 생신🎂',
+            'uploader': 'LE SSERAFIM ',
+            'uploader_id': 'd26ddc1e258488a0a2b795218d14d59d',
+            'channel': 'lesserafim',
+            'channel_id': '47',
+            'channel_url': 'https://weverse.io/lesserafim',
+            'creator': 'LE SSERAFIM',
+            'timestamp': 1659353400,
+            'upload_date': '20220801',
+            'release_timestamp': 1659353400,
+            'release_date': '20220801',
+            'duration': 3006,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+            'live_status': 'was_live',
+            'subtitles': {
+                'id_ID': 'count:2',
+                'en_US': 'count:2',
+                'es_ES': 'count:2',
+                'vi_VN': 'count:2',
+                'th_TH': 'count:2',
+                'zh_CN': 'count:2',
+                'zh_TW': 'count:2',
+                'ja_JP': 'count:2',
+                'ko_KR': 'count:2',
+            },
+        },
+    }, {
+        'url': 'https://weverse.io/treasure/live/2-117230416',
+        'info_dict': {
+            'id': '2-117230416',
+            'ext': 'mp4',
+            'title': r're:스껄도려님 첫 스무살 생파🦋',
+            'description': '',
+            'uploader': 'TREASURE',
+            'uploader_id': '77eabbc449ca37f7970054a136f60082',
+            'channel': 'treasure',
+            'channel_id': '20',
+            'channel_url': 'https://weverse.io/treasure',
+            'creator': 'TREASURE',
+            'timestamp': 1680667651,
+            'upload_date': '20230405',
+            'release_timestamp': 1680667639,
+            'release_date': '20230405',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+            'live_status': 'is_live',
+        },
+        'skip': 'Livestream has ended',
+    }]
+
+    def _real_extract(self, url):
+        channel, video_id = self._match_valid_url(url).group('artist', 'id')
+        post = self._call_post_api(video_id)
+        api_video_id = post['extension']['video']['videoId']
+        availability = self._extract_availability(post)
+        live_status = self._extract_live_status(post)
+        video_info, formats = {}, []
+
+        if live_status == 'is_upcoming':
+            self.raise_no_formats('Livestream has not yet started', expected=True)
+
+        elif live_status == 'is_live':
+            video_info = self._call_api(
+                f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2',
+                video_id, note='Downloading live JSON')
+            playback = self._parse_json(video_info['lipPlayback'], video_id)
+            m3u8_url = traverse_obj(playback, (
+                'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False)
+            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True)
+
+        elif live_status == 'post_live':
+            if availability in ('premium_only', 'subscriber_only'):
+                self.report_drm(video_id)
+            self.raise_no_formats(
+                'Livestream has ended and downloadable VOD is not available', expected=True)
+
+        else:
+            infra_video_id = post['extension']['video']['infraVideoId']
+            in_key = self._call_api(
+                f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id,
+                data=b'{}', note='Downloading VOD API key')['inKey']
+
+            video_info = self._download_json(
+                f'https://global.apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{infra_video_id}',
+                video_id, note='Downloading VOD JSON', query={
+                    'key': in_key,
+                    'sid': traverse_obj(post, ('extension', 'video', 'serviceId')) or '2070',
+                    'pid': str(uuid.uuid4()),
+                    'nonce': int(time.time() * 1000),
+                    'devt': 'html5_pc',
+                    'prv': 'Y' if post.get('membershipOnly') else 'N',
+                    'aup': 'N',
+                    'stpb': 'N',
+                    'cpl': 'en',
+                    'env': 'prod',
+                    'lc': 'en',
+                    'adi': '[{"adSystem":"null"}]',
+                    'adu': '/',
+                })
+
+            formats = self._get_formats(video_info, video_id)
+            has_drm = traverse_obj(video_info, ('meta', 'provider', 'name', {str.lower})) == 'drm'
+            if has_drm and formats:
+                self.report_warning(
+                    'Requested content is DRM-protected, only a 30-second preview is available', video_id)
+            elif has_drm and not formats:
+                self.report_drm(video_id)
+
+        return {
+            'id': video_id,
+            'channel': channel,
+            'channel_url': f'https://weverse.io/{channel}',
+            'formats': formats,
+            'availability': availability,
+            'live_status': live_status,
+            **self._parse_post_meta(post),
+            **NaverBaseIE.process_subtitles(video_info, self._get_subs),
+        }
+
+
+class WeverseMediaIE(WeverseBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)'
+    _TESTS = [{
+        'url': 'https://weverse.io/billlie/media/4-116372884',
+        'md5': '8efc9cfd61b2f25209eb1a5326314d28',
+        'info_dict': {
+            'id': 'e-C9wLSQs6o',
+            'ext': 'mp4',
+            'title': 'Billlie | \'EUNOIA\' Performance Video (heartbeat ver.)',
+            'description': 'md5:6181caaf2a2397bca913ffe368c104e5',
+            'channel': 'Billlie',
+            'channel_id': 'UCyc9sUCxELTDK9vELO5Fzeg',
+            'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg',
+            'uploader': 'Billlie',
+            'uploader_id': '@Billlie',
+            'uploader_url': 'http://www.youtube.com/@Billlie',
+            'upload_date': '20230403',
+            'duration': 211,
+            'age_limit': 0,
+            'playable_in_embed': True,
+            'live_status': 'not_live',
+            'availability': 'public',
+            'view_count': int,
+            'comment_count': int,
+            'like_count': int,
+            'channel_follower_count': int,
+            'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg',
+            'categories': ['Entertainment'],
+            'tags': 'count:7',
+        },
+    }, {
+        'url': 'https://weverse.io/billlie/media/3-102914520',
+        'md5': '031551fcbd716bc4f080cb6174a43d8a',
+        'info_dict': {
+            'id': '3-102914520',
+            'ext': 'mp4',
+            'title': 'From. SUHYEON🌸',
+            'description': 'Billlie 멤버별 독점 영상 공개💙💜',
+            'uploader': 'Billlie_official',
+            'uploader_id': 'f569c6e92f7eaffef0a395037dcaa54f',
+            'channel': 'billlie',
+            'channel_id': '72',
+            'channel_url': 'https://weverse.io/billlie',
+            'creator': 'Billlie',
+            'timestamp': 1662174000,
+            'upload_date': '20220903',
+            'release_timestamp': 1662174000,
+            'release_date': '20220903',
+            'duration': 17.0,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+            'live_status': 'not_live',
+        },
+    }]
+
+    def _real_extract(self, url):
+        channel, video_id = self._match_valid_url(url).group('artist', 'id')
+        post = self._call_post_api(video_id)
+        media_type = traverse_obj(post, ('extension', 'mediaInfo', 'mediaType', {str.lower}))
+        youtube_id = traverse_obj(post, ('extension', 'youtube', 'youtubeVideoId', {str}))
+
+        if media_type == 'vod':
+            return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE)
+        elif media_type == 'youtube' and youtube_id:
+            return self.url_result(youtube_id, YoutubeIE)
+        elif media_type == 'image':
+            self.raise_no_formats('No video content found in webpage', expected=True)
+        elif media_type:
+            raise ExtractorError(f'Unsupported media type "{media_type}"')
+
+        self.raise_no_formats('No video content found in webpage')
+
+
+class WeverseMomentIE(WeverseBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)'
+    _TESTS = [{
+        'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444',
+        'md5': '87733ac19a54081b7dfc2442036d282b',
+        'info_dict': {
+            'id': '1-117229444',
+            'ext': 'mp4',
+            'title': '今日もめっちゃいい天気☀️🌤️',
+            'uploader': '레아',
+            'uploader_id': '66a07e164b56a696ee71c99315ffe27b',
+            'channel': 'secretnumber',
+            'channel_id': '56',
+            'creator': 'SECRET NUMBER',
+            'duration': 10,
+            'upload_date': '20230405',
+            'timestamp': 1680653968,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+        },
+        'skip': 'Moment has expired',
+    }]
+
+    def _real_extract(self, url):
+        channel, uploader_id, video_id = self._match_valid_url(url).group('artist', 'uid', 'id')
+        post = self._call_post_api(video_id)
+        api_video_id = post['extension']['moment']['video']['videoId']
+        video_info = self._call_api(
+            f'/cvideo/v1.0/cvideo-{api_video_id}/playInfo?videoId={api_video_id}', video_id,
+            note='Downloading moment JSON')['playInfo']
+
+        return {
+            'id': video_id,
+            'channel': channel,
+            'uploader_id': uploader_id,
+            'formats': self._get_formats(video_info, video_id),
+            'availability': self._extract_availability(post),
+            **traverse_obj(post, {
+                'title': ((('extension', 'moment', 'body'), 'body'), {str}),
+                'uploader': ('author', 'profileName', {str}),
+                'creator': (('community', 'author'), 'communityName', {str}),
+                'channel_id': (('community', 'author'), 'communityId', {str_or_none}),
+                'duration': ('extension', 'moment', 'video', 'uploadInfo', 'playTime', {float_or_none}),
+                'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}),
+                'thumbnail': ('extension', 'moment', 'video', 'uploadInfo', 'imageUrl', {url_or_none}),
+                'like_count': ('emotionCount', {int_or_none}),
+                'comment_count': ('commentCount', {int_or_none}),
+            }, get_all=False),
+            **NaverBaseIE.process_subtitles(video_info, self._get_subs),
+        }
+
+
+class WeverseTabBaseIE(WeverseBaseIE):
+    _ENDPOINT = None
+    _PATH = None
+    _QUERY = {}
+    _RESULT_IE = None
+
+    def _entries(self, channel_id, channel, first_page):
+        query = self._QUERY.copy()
+
+        for page in itertools.count(1):
+            posts = first_page if page == 1 else self._call_api(
+                update_url_query(self._ENDPOINT % channel_id, query), channel,
+                note=f'Downloading {self._PATH} tab page {page}')
+
+            for post in traverse_obj(posts, ('data', lambda _, v: v['postId'])):
+                yield self.url_result(
+                    f'https://weverse.io/{channel}/{self._PATH}/{post["postId"]}',
+                    self._RESULT_IE, post['postId'], **self._parse_post_meta(post),
+                    channel=channel, channel_url=f'https://weverse.io/{channel}',
+                    availability=self._extract_availability(post),
+                    live_status=self._extract_live_status(post))
+
+            query['after'] = traverse_obj(posts, ('paging', 'nextParams', 'after', {str}))
+            if not query['after']:
+                break
+
+    def _real_extract(self, url):
+        channel = self._match_id(url)
+        channel_id = self._get_community_id(channel)
+
+        first_page = self._call_api(
+            update_url_query(self._ENDPOINT % channel_id, self._QUERY), channel,
+            note=f'Downloading {self._PATH} tab page 1')
+
+        return self.playlist_result(
+            self._entries(channel_id, channel, first_page), f'{channel}-{self._PATH}',
+            **traverse_obj(first_page, ('data', ..., {
+                'playlist_title': ('community', 'communityName', {str}),
+                'thumbnail': ('author', 'profileImageUrl', {url_or_none}),
+            }), get_all=False))
+
+
+class WeverseLiveTabIE(WeverseTabBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/live/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://weverse.io/billlie/live/',
+        'playlist_mincount': 55,
+        'info_dict': {
+            'id': 'billlie-live',
+            'title': 'Billlie',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+        },
+    }]
+
+    _ENDPOINT = '/post/v1.0/community-%s/liveTabPosts'
+    _PATH = 'live'
+    _QUERY = {'fieldSet': 'postsV1'}
+    _RESULT_IE = WeverseIE
+
+
+class WeverseMediaTabIE(WeverseTabBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://weverse.io/billlie/media/',
+        'playlist_mincount': 231,
+        'info_dict': {
+            'id': 'billlie-media',
+            'title': 'Billlie',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+        },
+    }, {
+        'url': 'https://weverse.io/lesserafim/media/all',
+        'only_matching': True,
+    }, {
+        'url': 'https://weverse.io/lesserafim/media/new',
+        'only_matching': True,
+    }]
+
+    _ENDPOINT = '/media/v1.0/community-%s/more'
+    _PATH = 'media'
+    _QUERY = {'fieldSet': 'postsV1', 'filterType': 'RECENT'}
+    _RESULT_IE = WeverseMediaIE
+
+
+class WeverseLiveIE(WeverseBaseIE):
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://weverse.io/purplekiss',
+        'info_dict': {
+            'id': '3-116560493',
+            'ext': 'mp4',
+            'title': r're:모하냥🫶🏻',
+            'description': '내일은 금요일~><',
+            'uploader': '채인',
+            'uploader_id': '1ffb1d9d904d6b3db2783f876eb9229d',
+            'channel': 'purplekiss',
+            'channel_id': '35',
+            'channel_url': 'https://weverse.io/purplekiss',
+            'creator': 'PURPLE KISS',
+            'timestamp': 1680780892,
+            'upload_date': '20230406',
+            'release_timestamp': 1680780883,
+            'release_date': '20230406',
+            'thumbnail': 'https://weverse-live.pstatic.net/v1.0/live/62044/thumb',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'availability': 'needs_auth',
+            'live_status': 'is_live',
+        },
+        'skip': 'Livestream has ended',
+    }, {
+        'url': 'https://weverse.io/billlie/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel = self._match_id(url)
+        channel_id = self._get_community_id(channel)
+
+        video_id = traverse_obj(
+            self._call_api(update_url_query(f'/post/v1.0/community-{channel_id}/liveTab', {
+                'debugMessage': 'true',
+                'fields': 'onAirLivePosts.fieldSet(postsV1).limit(10),reservedLivePosts.fieldSet(postsV1).limit(10)',
+            }), channel, note='Downloading live JSON'), (
+                ('onAirLivePosts', 'reservedLivePosts'), 'data',
+                lambda _, v: self._extract_live_status(v) in ('is_live', 'is_upcoming'), 'postId', {str}),
+            get_all=False)
+
+        if not video_id:
+            raise UserNotLive(video_id=channel)
+
+        return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE)

From 66468bbf49562ff82670cbbd456c5e8448a6df34 Mon Sep 17 00:00:00 2001
From: sqrtNOT <77981959+sqrtNOT@users.noreply.github.com>
Date: Fri, 26 May 2023 13:03:19 +0000
Subject: [PATCH 28/75] [extractor/comedycentral] Add support for movies (#7108)

Closes #1926

Authored by: sqrtNOT
---
 yt_dlp/extractor/comedycentral.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/comedycentral.py b/yt_dlp/extractor/comedycentral.py
index 05fc9f2b50..27d295bb38 100644
--- a/yt_dlp/extractor/comedycentral.py
+++ b/yt_dlp/extractor/comedycentral.py
@@ -2,7 +2,7 @@


 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<title>[0-9a-z]{6})'
+    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P<title>[0-9a-z]{6})'
     _FEED_URL = 'http://comedycentral.com/feeds/mrss/'

     _TESTS = [{
@@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
     }, {
         'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb',
         'only_matching': True,
+    }, {
+        'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas',
+        'only_matching': True,
     }]


From 08916a49c777cb6e000eec092881eb93ec22076c Mon Sep 17 00:00:00 2001
From: coletdjnz
Date: Sat, 27 May 2023 19:06:13 +1200
Subject: [PATCH 29/75] [core] Improve HTTP redirect handling (#7094)

Aligns HTTP redirect handling with what browsers commonly do and RFC standards.

Fixes issues https://github.com/yt-dlp/yt-dlp/commit/afac4caa7db30804bebac33e53c3cb0237958224 missed.
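
In rough terms, the browser-style rules this patch aligns with can be
sketched as follows (an illustrative simplification, not the exact code
added here):

    def redirect_method(method, status):
        # 303 See Other: always switch to GET (HEAD is preserved)
        if status == 303 and method != 'HEAD':
            return 'GET'
        # 301/302: browsers rewrite POST to GET and drop the request body
        if status in (301, 302) and method == 'POST':
            return 'GET'
        # 307/308: the method and body must be preserved unchanged
        return method
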
Authored by: coletdjnz --- test/test_YoutubeDL.py | 6 - test/test_http.py | 288 +++++++++++++++++++++++++++++++++++++---- yt_dlp/utils/_utils.py | 59 +++------ 3 files changed, 281 insertions(+), 72 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 477fd220ef..ee6c527135 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -10,7 +10,6 @@ import copy import json -import urllib.error from test.helper import FakeYDL, assertRegexpMatches from yt_dlp import YoutubeDL @@ -1097,11 +1096,6 @@ def test_selection(params, expected_ids, evaluate_all=False): test_selection({'playlist_items': '-15::2'}, INDICES[1::2], True) test_selection({'playlist_items': '-15::15'}, [], True) - def test_urlopen_no_file_protocol(self): - # see https://github.com/ytdl-org/youtube-dl/issues/8227 - ydl = YDL() - self.assertRaises(urllib.error.URLError, ydl.urlopen, 'file:///etc/passwd') - def test_do_not_override_ie_key_in_url_transparent(self): ydl = YDL() diff --git a/test/test_http.py b/test/test_http.py index 5ca0d7a470..d684905da5 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -7,40 +7,163 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - +import gzip +import http.cookiejar import http.server +import io +import pathlib import ssl +import tempfile import threading +import urllib.error import urllib.request from test.helper import http_server_port from yt_dlp import YoutubeDL +from yt_dlp.utils import sanitized_Request, urlencode_postdata + +from .helper import FakeYDL TEST_DIR = os.path.dirname(os.path.abspath(__file__)) class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + def log_message(self, format, *args): pass + def _headers(self): + payload = str(self.headers).encode('utf-8') + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _redirect(self): + self.send_response(int(self.path[len('/redirect_'):])) + self.send_header('Location', '/method') + self.send_header('Content-Length', '0') + self.end_headers() + + def _method(self, method, payload=None): + self.send_response(200) + self.send_header('Content-Length', str(len(payload or ''))) + self.send_header('Method', method) + self.end_headers() + if payload: + self.wfile.write(payload) + + def _status(self, status): + payload = f'{status} NOT FOUND'.encode() + self.send_response(int(status)) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _read_data(self): + if 'Content-Length' in self.headers: + return self.rfile.read(int(self.headers['Content-Length'])) + + def do_POST(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('POST', data) + elif self.path.startswith('/headers'): + self._headers() + else: + self._status(404) + + def do_HEAD(self): + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('HEAD') + else: + self._status(404) + + def do_PUT(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('PUT', data) + else: + self._status(404) + def do_GET(self): if self.path == '/video.html': + payload = b'