From c4853655cb9a793129280806af643de43c48f4d5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:07:15 -0500 Subject: [PATCH 01/15] [ie/wrestleuniverse] Avoid partial stream formats (#9800) Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 880ee519b..d401d6d39 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -12,6 +12,7 @@ jwt_decode_hs256, traverse_obj, try_call, + url_basename, url_or_none, urlencode_postdata, variadic, @@ -194,8 +195,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'formats': self._get_formats(video_data, ( - (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id), + 'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id), **traverse_obj(metadata, { 'title': ('displayName', {str}), 'description': ('description', {str}), @@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE): 'params': { 'skip_download': 'm3u8', }, + }, { + 'note': 'manifest provides live-a (partial) and live-b (full) streams', + 'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g', + 'only_matching': True, }] _API_PATH = 'events' @@ -285,12 +289,16 @@ def _real_extract(self, url): video_data, decrypt = self._call_encrypted_api( video_id, ':watchArchive', 'watch archive', data={'method': 1}) - info['formats'] = self._get_formats(video_data, ( - ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id) + # 'chromecastUrls' can be only partial videos, avoid + info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id) for f in info['formats']: # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values if f.get('tbr'): f['tbr'] = int(f['tbr'] / 2.5) + # prefer variants with the same basename as the master playlist to avoid partial streams + f['format_id'] = url_basename(f['url']).partition('.')[0] + if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]): + f['preference'] = -10 hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt})) if hls_aes_key: From 231c2eacc41b06b65c63edf94c0d04768a5da607 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:14:36 -0500 Subject: [PATCH 02/15] [ie/soundcloud] Extract `genres` (#9821) Authored by: bashonly --- yt_dlp/extractor/soundcloud.py | 50 ++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index c9ed645eb..c9ca41a5c 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -361,7 +361,7 @@ def extract_count(key): 'like_count': extract_count('favoritings') or extract_count('likes'), 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), + 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)), 'formats': formats if not extract_flat else None } @@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE): _TESTS = [ { 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', + 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2', 'info_dict': { 'id': '62986583', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. ExTerrestrial Music', @@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', + 'uploader_url': 'https://soundcloud.com/ethmusic', + 'genres': [], } }, # geo-restricted @@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { 'id': '47127627', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', @@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/the-concept-band', + 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', + 'genres': ['Alternative'], }, }, # private link @@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/jaimemf', + 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', + 'genres': ['youtubedl'], }, }, # private link (alt format) @@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/jaimemf', + 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', + 'genres': ['youtubedl'], }, }, # downloadable song @@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE): 'info_dict': { 'id': '343609555', 'ext': 'wav', + 'title': 'The Following', + 'description': '', + 'uploader': '80M', + 'uploader_id': '312384765', + 'uploader_url': 'https://soundcloud.com/the80m', + 'upload_date': '20170922', + 'timestamp': 1506120436, + 'duration': 397.228, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg', + 'license': 'all-rights-reserved', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'view_count': int, + 'genres': ['Dance & EDM'], }, }, # private link, downloadable format @@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg', + 'uploader_url': 'https://soundcloud.com/oriuplift', + 'genres': ['Trance'], }, }, # no album art, use avatar pic for thumbnail @@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/garyvee', + 'genres': [], }, 'params': { 'skip_download': True, @@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE): }, { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', - 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'md5': '8227c3473a4264df6b02ad7e5b7527ac', 'info_dict': { 'id': '583011102', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Mezzo Valzer', - 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Micronie', + 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a', + 'uploader': 'Giovanni Sarani', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'genres': ['Piano'], + 'uploader_url': 'https://soundcloud.com/giovannisarani', }, }, { From cb2fb4a643949322adba561ca73bcba3221ec0c5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:15:44 -0500 Subject: [PATCH 03/15] [ie/crunchyroll] Always make metadata available (#9772) Closes #9750 Authored by: bashonly --- yt_dlp/extractor/crunchyroll.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 385a3c2d3..a157cddac 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -394,10 +394,11 @@ def entries(): if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): message = f'This {object_type} is for premium members only' if CrunchyrollBaseIE._REFRESH_TOKEN: - raise ExtractorError(message, expected=True) - self.raise_login_required(message, method='password') - - result['formats'], result['subtitles'] = self._extract_stream(internal_id) + self.raise_no_formats(message, expected=True, video_id=internal_id) + else: + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], result['subtitles'] = self._extract_stream(internal_id) result['chapters'] = self._extract_chapters(internal_id) @@ -583,14 +584,16 @@ def _real_extract(self, url): if not response: raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + result = self._transform_music_response(response) + if not self._IS_PREMIUM and response.get('isPremiumOnly'): message = f'This {response.get("type") or "media"} is for premium members only' if CrunchyrollBaseIE._REFRESH_TOKEN: - raise ExtractorError(message, expected=True) - self.raise_login_required(message, method='password') - - result = self._transform_music_response(response) - result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) + self.raise_no_formats(message, expected=True, video_id=internal_id) + else: + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) return result From 036e0d92c6052465673d459678322ea03e61483d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 17:11:11 -0500 Subject: [PATCH 04/15] [ie/patreon] Extract multiple embeds (#9850) Closes #9848 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 134 ++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 9381c7eab..6c441ff34 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'params': {'skip_download': 'm3u8'}, + }, { + # multiple attachments/embeds + 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977', + 'playlist_count': 3, + 'info_dict': { + 'id': '100601977', + 'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis', + 'description': 'md5:d099ab976edfce6de2a65c2b169a88d3', + 'uploader': 'Bradley Hall', + 'uploader_id': '24401883', + 'uploader_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_id': '3193932', + 'channel_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_follower_count': int, + 'timestamp': 1710777855, + 'upload_date': '20240318', + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:^https?://.+', + }, + 'skip': 'Patron-only content', }] + _RETURN_TYPE = 'video' def _real_extract(self, url): video_id = self._match_id(url) @@ -234,58 +256,54 @@ def _real_extract(self, url): 'include': 'audio,user,user_defined_tags,campaign,attachments_media', }) attributes = post['data']['attributes'] - title = attributes['title'].strip() - image = attributes.get('image') or {} - info = { - 'id': video_id, - 'title': title, - 'description': clean_html(attributes.get('content')), - 'thumbnail': image.get('large_url') or image.get('url'), - 'timestamp': parse_iso8601(attributes.get('published_at')), - 'like_count': int_or_none(attributes.get('like_count')), - 'comment_count': int_or_none(attributes.get('comment_count')), - } - can_view_post = traverse_obj(attributes, 'current_user_can_view') - if can_view_post and info['comment_count']: - info['__post_extractor'] = self.extract_comments(video_id) + info = traverse_obj(attributes, { + 'title': ('title', {str.strip}), + 'description': ('content', {clean_html}), + 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any), + 'timestamp': ('published_at', {parse_iso8601}), + 'like_count': ('like_count', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + }) - for i in post.get('included', []): - i_type = i.get('type') - if i_type == 'media': - media_attributes = i.get('attributes') or {} - download_url = media_attributes.get('download_url') + entries = [] + idx = 0 + for include in traverse_obj(post, ('included', lambda _, v: v['type'])): + include_type = include['type'] + if include_type == 'media': + media_attributes = traverse_obj(include, ('attributes', {dict})) or {} + download_url = url_or_none(media_attributes.get('download_url')) ext = mimetype2ext(media_attributes.get('mimetype')) # if size_bytes is None, this media file is likely unavailable # See: https://github.com/yt-dlp/yt-dlp/issues/4608 size_bytes = int_or_none(media_attributes.get('size_bytes')) if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: - # XXX: what happens if there are multiple attachments? - return { - **info, + idx += 1 + entries.append({ + 'id': f'{video_id}-{idx}', 'ext': ext, 'filesize': size_bytes, 'url': download_url, - } - elif i_type == 'user': - user_attributes = i.get('attributes') - if user_attributes: - info.update({ - 'uploader': user_attributes.get('full_name'), - 'uploader_id': str_or_none(i.get('id')), - 'uploader_url': user_attributes.get('url'), }) - elif i_type == 'post_tag': - info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) + elif include_type == 'user': + info.update(traverse_obj(include, { + 'uploader': ('attributes', 'full_name', {str}), + 'uploader_id': ('id', {str_or_none}), + 'uploader_url': ('attributes', 'url', {url_or_none}), + })) - elif i_type == 'campaign': - info.update({ - 'channel': traverse_obj(i, ('attributes', 'title')), - 'channel_id': str_or_none(i.get('id')), - 'channel_url': traverse_obj(i, ('attributes', 'url')), - 'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), - }) + elif include_type == 'post_tag': + if post_tag := traverse_obj(include, ('attributes', 'value', {str})): + info.setdefault('tags', []).append(post_tag) + + elif include_type == 'campaign': + info.update(traverse_obj(include, { + 'channel': ('attributes', 'title', {str}), + 'channel_id': ('id', {str_or_none}), + 'channel_url': ('attributes', 'url', {url_or_none}), + 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), + })) # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -296,36 +314,50 @@ def _real_extract(self, url): v_url, video_id, 'Checking Vimeo embed URL', headers={'Referer': 'https://patreon.com/'}, fatal=False, errnote=False): - return self.url_result( + entries.append(self.url_result( VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), - VimeoIE, url_transparent=True, **info) + VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): - return self.url_result(embed_url, **info) + entries.append(self.url_result(embed_url)) - post_file = traverse_obj(attributes, 'post_file') + post_file = traverse_obj(attributes, ('post_file', {dict})) if post_file: name = post_file.get('name') ext = determine_ext(name) if ext in KNOWN_EXTENSIONS: - return { - **info, + entries.append({ + 'id': video_id, 'ext': ext, 'url': post_file['url'], - } + }) elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) - return { - **info, + entries.append({ + 'id': video_id, 'formats': formats, 'subtitles': subtitles, - } + }) - if can_view_post is False: + can_view_post = traverse_obj(attributes, 'current_user_can_view') + comments = None + if can_view_post and info.get('comment_count'): + comments = self.extract_comments(video_id) + + if not entries and can_view_post is False: self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) - else: + elif not entries: self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) + elif len(entries) == 1: + info.update(entries[0]) + else: + for entry in entries: + entry.update(info) + return self.playlist_result(entries, video_id, **info, __post_extractor=comments) + + info['id'] = video_id + info['__post_extractor'] = comments return info def _get_comments(self, post_id): From bec9a59e8ec82c18e3bf9268eaa436793dd52e35 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 17:19:42 -0500 Subject: [PATCH 05/15] [networking] Add `extensions` attribute to `Response` (#9756) CurlCFFIRH now provides an `impersonate` field in its responses' extensions Authored by: bashonly --- test/test_networking.py | 19 +++++++++++++++++++ yt_dlp/networking/_curlcffi.py | 10 ++++++++++ yt_dlp/networking/common.py | 6 +++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/test/test_networking.py b/test/test_networking.py index b50f70d08..d613cb568 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -785,6 +785,25 @@ def test_supported_impersonate_targets(self, handler): assert res.status == 200 assert std_headers['user-agent'].lower() not in res.read().decode().lower() + def test_response_extensions(self, handler): + with handler() as rh: + for target in rh.supported_targets: + request = Request( + f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target}) + res = validate_and_send(rh, request) + assert res.extensions['impersonate'] == rh._get_request_target(request) + + def test_http_error_response_extensions(self, handler): + with handler() as rh: + for target in rh.supported_targets: + request = Request( + f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target}) + try: + validate_and_send(rh, request) + except HTTPError as e: + res = e.response + assert res.extensions['impersonate'] == rh._get_request_target(request) + class TestRequestHandlerMisc: """Misc generic tests for request handlers, not related to request or validation testing""" diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index 39d1f70fb..10751a105 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -132,6 +132,16 @@ def _check_extensions(self, extensions): extensions.pop('cookiejar', None) extensions.pop('timeout', None) + def send(self, request: Request) -> Response: + target = self._get_request_target(request) + try: + response = super().send(request) + except HTTPError as e: + e.response.extensions['impersonate'] = target + raise + response.extensions['impersonate'] = target + return response + def _send(self, request: Request): max_redirects_exceeded = False session: curl_cffi.requests.Session = self._get_instance( diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 4c66ba66a..a2217034c 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -497,6 +497,7 @@ class Response(io.IOBase): @param headers: response headers. @param status: Response HTTP status code. Default is 200 OK. @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided. + @param extensions: Dictionary of handler-specific response extensions. """ def __init__( @@ -505,7 +506,9 @@ def __init__( url: str, headers: Mapping[str, str], status: int = 200, - reason: str = None): + reason: str = None, + extensions: dict = None + ): self.fp = fp self.headers = Message() @@ -517,6 +520,7 @@ def __init__( self.reason = reason or HTTPStatus(status).phrase except ValueError: self.reason = None + self.extensions = extensions or {} def readable(self): return self.fp.readable() From 96da9525043f78aca4544d01761b13b2140e9ae6 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Sun, 5 May 2024 00:44:08 +0200 Subject: [PATCH 06/15] [core] Warn if lack of ffmpeg alters format selection (#9805) Authored by: seproDev, pukkandan --- yt_dlp/YoutubeDL.py | 53 +++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9f730d038..e0d58f0f4 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2136,6 +2136,11 @@ def _filter(f): def _check_formats(self, formats): for f in formats: + working = f.get('__working') + if working is not None: + if working: + yield f + continue self.to_screen('[info] Testing format %s' % f['format_id']) path = self.get_output_path('temp') if not self._ensure_dir_exists(f'{path}/'): @@ -2152,33 +2157,44 @@ def _check_formats(self, formats): os.remove(temp_file.name) except OSError: self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) + f['__working'] = success if success: yield f else: self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) + def _select_formats(self, formats, selector): + return list(selector({ + 'formats': formats, + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video + or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio + })) + def _default_format_spec(self, info_dict, download=True): + download = download and not self.params.get('simulate') + prefer_best = download and ( + self.params['outtmpl']['default'] == '-' + or info_dict.get('is_live') and not self.params.get('live_from_start')) def can_merge(): merger = FFmpegMergerPP(self) return merger.available and merger.can_merge() - prefer_best = ( - not self.params.get('simulate') - and download - and ( - not can_merge() - or info_dict.get('is_live') and not self.params.get('live_from_start') - or self.params['outtmpl']['default'] == '-')) - compat = ( - prefer_best - or self.params.get('allow_multiple_audio_streams', False) - or 'format-spec' in self.params['compat_opts']) + if not prefer_best and download and not can_merge(): + prefer_best = True + formats = self._get_formats(info_dict) + evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec)) + if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'): + self.report_warning('ffmpeg not found. The downloaded format may not be the best available. ' + 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies') - return ( - 'best/bestvideo+bestaudio' if prefer_best - else 'bestvideo*+bestaudio/best' if not compat - else 'bestvideo+bestaudio/best') + compat = (self.params.get('allow_multiple_audio_streams') + or 'format-spec' in self.params['compat_opts']) + + return ('best/bestvideo+bestaudio' if prefer_best + else 'bestvideo+bestaudio/best' if compat + else 'bestvideo*+bestaudio/best') def build_format_selector(self, format_spec): def syntax_error(note, start): @@ -2928,12 +2944,7 @@ def is_wellformed(f): self.write_debug(f'Default format spec: {req_format}') format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector({ - 'formats': formats, - 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), - 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video - or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio - })) + formats_to_download = self._select_formats(formats, format_selector) if interactive_format_selection and not formats_to_download: self.report_error('Requested format is not available', tb=False, is_error=False) continue From 351368cb9a6731b886a58f5a10fd6b302bbe47be Mon Sep 17 00:00:00 2001 From: The-MAGI <110553776+The-MAGI@users.noreply.github.com> Date: Mon, 6 May 2024 01:57:38 +0300 Subject: [PATCH 07/15] [ie/youporn] Fix extractor (#8827) Closes #7967 Authored by: The-MAGI --- yt_dlp/extractor/youporn.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 6ee0abcae..6d4e31bf3 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor): 'id': '16290308', 'age_limit': 18, 'categories': [], - 'description': 'md5:00ea70f642f431c379763c17c2f396bc', + 'description': str, # TODO: detect/remove SEO spam description in ytdl backport 'display_id': 'tinderspecial-trailer1', 'duration': 298.0, 'ext': 'mp4', 'upload_date': '20201123', 'uploader': 'Ersties', 'tags': [], - 'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg', - 'timestamp': 1606089600, + 'thumbnail': r're:https://.+\.jpg', + 'timestamp': 1606147564, 'title': 'Tinder In Real Life', 'view_count': int, } @@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor): def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') - definitions = self._download_json( - f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id) + self._set_cookie('.youporn.com', 'age_verified', '1') + webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id) + definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions'] - def get_format_data(data, f): - return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl'])) + def get_format_data(data, stream_type): + info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any)) + if not info_url: + return [] + return traverse_obj( + self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False), + lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl'])) formats = [] # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s @@ -123,10 +129,6 @@ def get_format_data(data, f): f['height'] = height formats.append(f) - webpage = self._download_webpage( - 'http://www.youporn.com/watch/%s' % video_id, display_id, - headers={'Cookie': 'age_verified=1'}) - title = self._html_search_regex( r'(?s)]+class=["\']watchVideoTitle[^>]+>(.+?)', webpage, 'title', default=None) or self._og_search_title( From c8bf48f3a8fa29587e7c73ef5a7710385a5ea725 Mon Sep 17 00:00:00 2001 From: Chris Caruso Date: Sun, 5 May 2024 16:02:24 -0700 Subject: [PATCH 08/15] [ie/cbc.ca:player] Improve `_VALID_URL` (#9866) Closes #9825 Authored by: carusocr --- yt_dlp/extractor/cbc.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index ff320dd68..a4180262b 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -151,7 +151,7 @@ def _real_extract(self, url): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor): 'location': 'Canada', 'media_type': 'Full Program', }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/1.7194274', + 'md5': '188b96cf6bdcb2540e178a6caa957128', + 'info_dict': { + 'id': '2334524995812', + 'ext': 'mp4', + 'title': '#TheMoment a rare white spirit moose was spotted in Alberta', + 'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3', + 'timestamp': 1714788791, + 'duration': 77.678, + 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg', + 'uploader': 'CBCC-NEW', + 'chapters': 'count:0', + 'upload_date': '20240504', + 'categories': 'count:3', + 'series': 'The National', + 'tags': 'count:15', + 'creators': ['encoder'], + 'location': 'Canada', + 'media_type': 'Excerpt', + }, }, { 'url': 'cbcplayer:1.7159484', 'only_matching': True, From 5904853ae5788509fdc4892cb7ecdfa9ae7f78e6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 5 May 2024 18:15:32 -0500 Subject: [PATCH 09/15] [ie/crunchyroll] Support browser impersonation (#9857) Closes #7442 Authored by: bashonly --- yt_dlp/extractor/crunchyroll.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index a157cddac..90967c160 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -53,15 +53,19 @@ def _set_auth_info(self, response): CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): - try: # TODO: Add impersonation support here + try: return self._download_json( f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote, - headers=headers, data=urlencode_postdata(data)) + headers=headers, data=urlencode_postdata(data), impersonate=True) except ExtractorError as error: if not isinstance(error.cause, HTTPError) or error.cause.status != 403: raise + if target := error.cause.response.extensions.get('impersonate'): + raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"') raise ExtractorError( - 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'Request blocked by Cloudflare. ' + 'Install the required impersonation dependency if possible, ' + 'or else navigate to Crunchyroll in your browser, ' 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' 'and your browser\'s User-Agent (with --user-agent)', expected=True) From 145dc6f6563e80d2da1b3e9aea2ffa795b71622c Mon Sep 17 00:00:00 2001 From: Rasmus Antons Date: Wed, 8 May 2024 22:16:32 +0200 Subject: [PATCH 10/15] [ie/boosty] Add cookies support (#9522) Closes #9401 Authored by: RasmusAntons --- yt_dlp/extractor/boosty.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/boosty.py b/yt_dlp/extractor/boosty.py index fb14ca146..d3aab7a1a 100644 --- a/yt_dlp/extractor/boosty.py +++ b/yt_dlp/extractor/boosty.py @@ -1,7 +1,11 @@ +import json +import urllib.parse + from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( ExtractorError, + bug_reports_message, int_or_none, qualities, str_or_none, @@ -162,9 +166,19 @@ def _extract_formats(self, player_urls, video_id): def _real_extract(self, url): user, post_id = self._match_valid_url(url).group('user', 'post_id') + + auth_headers = {} + auth_cookie = self._get_cookies('https://boosty.to/').get('auth') + if auth_cookie is not None: + try: + auth_data = json.loads(urllib.parse.unquote(auth_cookie.value)) + auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}' + except (json.JSONDecodeError, KeyError): + self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}') + post = self._download_json( f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id, - note='Downloading post data', errnote='Unable to download post data') + note='Downloading post data', errnote='Unable to download post data', headers=auth_headers) post_title = post.get('title') if not post_title: @@ -202,7 +216,9 @@ def _real_extract(self, url): 'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}), }, get_all=False)}) - if not entries: + if not entries and not post.get('hasAccess'): + self.raise_login_required('This post requires a subscription', metadata_available=True) + elif not entries: raise ExtractorError('No videos found', expected=True) if len(entries) == 1: return entries[0] From b38018b781b062d5169d104ab430489aef8e7f1e Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Wed, 8 May 2024 20:51:16 +0000 Subject: [PATCH 11/15] [ie/mixch] Extract comments (#9860) Authored by: pzhlkj6612 --- yt_dlp/extractor/mixch.py | 41 +++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index b980fd01a..58c4a2301 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,6 +1,12 @@ from .common import InfoExtractor from ..networking.exceptions import HTTPError -from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none +from ..utils import ( + ExtractorError, + UserNotLive, + int_or_none, + str_or_none, + url_or_none, +) from ..utils.traversal import traverse_obj @@ -9,17 +15,20 @@ class MixchIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P\d+)' _TESTS = [{ - 'url': 'https://mixch.tv/u/16236849/live', + 'url': 'https://mixch.tv/u/16943797/live', 'skip': 'don\'t know if this live persists', 'info_dict': { - 'id': '16236849', - 'title': '24配信シェア⭕️投票🙏💦', - 'comment_count': 13145, - 'view_count': 28348, - 'timestamp': 1636189377, - 'uploader': '🦥伊咲👶🏻#フレアワ', - 'uploader_id': '16236849', - } + 'id': '16943797', + 'ext': 'mp4', + 'title': '#EntView #カリナ #セブチ 2024-05-05 06:58', + 'comment_count': int, + 'view_count': int, + 'timestamp': 1714726805, + 'uploader': 'Ent.View K-news🎶💕', + 'uploader_id': '16943797', + 'live_status': 'is_live', + 'upload_date': '20240503', + }, }, { 'url': 'https://mixch.tv/u/16137876/live', 'only_matching': True, @@ -48,8 +57,20 @@ def _real_extract(self, url): 'protocol': 'm3u8', }], 'is_live': True, + '__post_extractor': self.extract_comments(video_id), } + def _get_comments(self, video_id): + yield from traverse_obj(self._download_json( + f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id, + note='Downloading comments', errnote='Failed to download comments'), (..., { + 'author': ('name', {str}), + 'author_id': ('user_id', {str_or_none}), + 'id': ('message_id', {str}, {lambda x: x or None}), + 'text': ('body', {str}), + 'timestamp': ('created', {int}), + })) + class MixchArchiveIE(InfoExtractor): IE_NAME = 'mixch:archive' From df5c9e733aaba703cf285c0372b6d61629330c82 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Wed, 8 May 2024 23:02:22 +0200 Subject: [PATCH 12/15] [ie/vk] Improve format extraction (#9885) Closes #5675 Authored by: seproDev --- yt_dlp/extractor/vk.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 7e3a3a9a9..28d502685 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -451,6 +451,7 @@ def _real_extract(self, url): info_page, 'view count', default=None)) formats = [] + subtitles = {} for format_id, format_url in data.items(): format_url = url_or_none(format_url) if not format_url or not format_url.startswith(('http', '//', 'rtmp')): @@ -462,12 +463,21 @@ def _real_extract(self, url): formats.append({ 'format_id': format_id, 'url': format_url, + 'ext': 'mp4', + 'source_preference': 1, 'height': height, }) elif format_id == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( format_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False, live=is_live)) + m3u8_id=format_id, fatal=False, live=is_live) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_id.startswith('dash_'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif format_id == 'rtmp': formats.append({ 'format_id': format_id, @@ -475,7 +485,6 @@ def _real_extract(self, url): 'ext': 'flv', }) - subtitles = {} for sub in data.get('subs') or {}: subtitles.setdefault(sub.get('lang', 'en'), []).append({ 'ext': sub.get('title', '.srt').split('.')[-1], @@ -496,6 +505,7 @@ def _real_extract(self, url): 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, 'subtitles': subtitles, + '_format_sort_fields': ('res', 'source'), } From 06d52c87314e0bbc16c43c405090843885577b88 Mon Sep 17 00:00:00 2001 From: fireattack Date: Thu, 9 May 2024 05:09:38 +0800 Subject: [PATCH 13/15] [ie/BilibiliSpaceVideo] Better error message (#9839) Closes #9528 Authored by: fireattack --- yt_dlp/extractor/bilibili.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index fee4b2994..6221e9a51 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1049,9 +1049,10 @@ def fetch_page(page_idx): raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise - if response['code'] == -401: + if response['code'] in (-352, -401): raise ExtractorError( - 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) + f'Request is blocked by server ({-response["code"]}), ' + 'please add cookies, wait and try later.', expected=True) return response['data'] def get_metadata(page_data): From 2338827072dacab0f15348b70aec8685feefc8d1 Mon Sep 17 00:00:00 2001 From: fireattack Date: Thu, 9 May 2024 05:24:44 +0800 Subject: [PATCH 14/15] [ie/bilibili] Fix `--geo-verification-proxy` support (#9817) Closes #9797 Authored by: fireattack --- yt_dlp/extractor/bilibili.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 6221e9a51..df3470003 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -93,11 +93,11 @@ def extract_formats(self, play_info): return formats - def _download_playinfo(self, video_id, cid): + def _download_playinfo(self, video_id, cid, headers=None): return self._download_json( 'https://api.bilibili.com/x/player/playurl', video_id, query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, - note=f'Downloading video formats for cid {cid}')['data'] + note=f'Downloading video formats for cid {cid}', headers=headers)['data'] def json2srt(self, json_data): srt_data = '' @@ -493,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, video_id) + headers = self.geo_verification_headers() + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers) if not self._match_valid_url(urlh.url): return self.url_result(urlh.url) @@ -531,7 +532,7 @@ def _real_extract(self, url): self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, - note='Extracting videos in anthology'), + note='Extracting videos in anthology', headers=headers), 'data', expected_type=list) or [] is_anthology = len(page_list_json) > 1 @@ -552,7 +553,7 @@ def _real_extract(self, url): festival_info = {} if is_festival: - play_info = self._download_playinfo(video_id, cid) + play_info = self._download_playinfo(video_id, cid, headers=headers) festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), @@ -666,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE): def _real_extract(self, url): episode_id = self._match_id(url) - webpage = self._download_webpage(url, episode_id) + headers = self.geo_verification_headers() + webpage = self._download_webpage(url, episode_id, headers=headers) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - headers = {'Referer': url, **self.geo_verification_headers()} + headers['Referer'] = url play_info = self._download_json( 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, @@ -724,7 +726,7 @@ def _real_extract(self, url): 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), - 'http_headers': headers, + 'http_headers': {'Referer': url}, } From c4b87dd885ee5391e5f481e7c8bd550a7c543623 Mon Sep 17 00:00:00 2001 From: src-tinkerer <149616646+src-tinkerer@users.noreply.github.com> Date: Wed, 8 May 2024 21:27:30 +0000 Subject: [PATCH 15/15] [ie/ZenYandex] Fix extractor (#9813) Closes #9803 Authored by: src-tinkerer --- yt_dlp/extractor/yandexvideo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index 4382a5684..95a9446e3 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -259,15 +259,15 @@ def _real_extract(self, url): webpage = self._download_webpage(redirect, video_id, note='Redirecting') data_json = self._search_json( r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') - serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', - webpage, 'server state').replace('State', 'Settings') + serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state') uploader = self._search_regex(r'(]+>)', webpage, 'uploader', default='') uploader_name = extract_attributes(uploader).get('aria-label') - video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict) - stream_urls = try_get(video_json, lambda x: x['video']['streams']) + item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str})) + video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {} + formats, subtitles = [], {} - for s_url in stream_urls: + for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})): ext = determine_ext(s_url) if ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')