From e4cf7741f9302b3faa092962f2895b55cb3d89bb Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 21 Mar 2023 17:48:22 -0500 Subject: [PATCH 001/501] [extractor/rozhlas] Extract manifest formats (#6590) Closes #6584 Authored by: bashonly --- yt_dlp/extractor/rozhlas.py | 80 ++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py index 08ebb93e3d..5cc664e00b 100644 --- a/yt_dlp/extractor/rozhlas.py +++ b/yt_dlp/extractor/rozhlas.py @@ -1,5 +1,12 @@ from .common import InfoExtractor -from ..utils import extract_attributes, int_or_none, remove_start, traverse_obj +from ..utils import ( + extract_attributes, + int_or_none, + remove_start, + str_or_none, + traverse_obj, + url_or_none, +) class RozhlasIE(InfoExtractor): @@ -50,7 +57,7 @@ class RozhlasVltavaIE(InfoExtractor): 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', 'md5': 'ba2fdbc1242fc16771c7695d271ec355', 'info_dict': { - 'id': 8891337, + 'id': '8891337', 'title': 'md5:21f99739d04ab49d8c189ec711eef4ec', }, 'playlist_count': 1, @@ -69,7 +76,7 @@ class RozhlasVltavaIE(InfoExtractor): }, { 'url': 'https://wave.rozhlas.cz/poslechnete-si-neklid-podcastovy-thriller-o-vine-strachu-a-vztahu-ktery-zasel-8554744', 'info_dict': { - 'id': 8554744, + 'id': '8554744', 'title': 'Poslechněte si Neklid. Podcastový thriller o vině, strachu a vztahu, který zašel příliš daleko', }, 'playlist_count': 5, @@ -139,27 +146,62 @@ class RozhlasVltavaIE(InfoExtractor): 'chapter_number': 5, }, }] + }, { + 'url': 'https://dvojka.rozhlas.cz/karel-siktanc-cerny-jezdec-bily-kun-napinava-pohadka-o-tajemnem-prizraku-8946969', + 'info_dict': { + 'id': '8946969', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '10631121', + 'ext': 'm4a', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. 
Napínavá pohádka o tajemném přízraku', + 'description': 'Karel Šiktanc: Černý jezdec, bílý kůň', + 'duration': 2656, + 'artist': 'Tvůrčí skupina Drama a literatura', + 'channel_id': 'dvojka', + }, + }], + 'params': {'skip_download': 'dash'}, }] def _extract_video(self, entry): - chapter_number = int_or_none(traverse_obj(entry, ('meta', 'ga', 'contentSerialPart'))) + formats = [] + audio_id = entry['meta']['ga']['contentId'] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext, fatal=False)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext, fatal=False)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + + chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) + return { - 'id': entry['meta']['ga']['contentId'], - 'title': traverse_obj(entry, ('meta', 'ga', 'contentName')), - 'description': entry.get('title'), - 'duration': entry.get('duration'), - 'artist': traverse_obj(entry, ('meta', 'ga', 'contentAuthor')), - 'channel_id': traverse_obj(entry, ('meta', 'ga', 'contentCreator')), + 'id': audio_id, 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None, 'chapter_number': chapter_number, - 'formats': [{ - 'url': audio_link['url'], - 'ext': audio_link.get('variant'), - 'format_id': audio_link.get('variant'), - 'abr': audio_link.get('bitrate'), - 'acodec': audio_link.get('variant'), - 'vcodec': 'none', - } for audio_link in entry['audioLinks']], + 'formats': formats, + **traverse_obj(entry, { + 'title': ('meta', 'ga', 'contentName'), + 'description': 'title', + 'duration': ('duration', {int_or_none}), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + }) } def _real_extract(self, url): @@ -173,7 +215,7 @@ def _real_extract(self, url): return { '_type': 'playlist', - 'id': data.get('embedId'), + 'id': str_or_none(data.get('embedId')) or video_id, 'title': traverse_obj(data, ('series', 'title')), 'entries': map(self._extract_video, data['playlist']), } From 06966cb8966b9aa4f60ab9c44c182a057d4ca3a3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 21 Mar 2023 17:57:46 -0500 Subject: [PATCH 002/501] [extractor/bravotv] Fix extractor (#6568) Closes #6562 Authored by: bashonly --- yt_dlp/extractor/bravotv.py | 232 +++++++++++++++++++++++------------- 1 file changed, 150 insertions(+), 82 deletions(-) diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index d4895848e0..d4bf9b53b7 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -1,117 +1,185 @@ -import re - from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, - int_or_none, + extract_attributes, float_or_none, - try_get, - dict_get, + get_element_html_by_class, + int_or_none, + merge_dicts, + parse_age_limit, + remove_end, + str_or_none, + traverse_obj, + unescapeHTML, + unified_timestamp, + update_url_query, + url_or_none, ) class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 
'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', 'info_dict': { - 'id': 'epL0pmK1kQlT', + 'id': '3923059', 'ext': 'mp4', 'title': 'The Top Chef Season 16 Winner Is...', 'description': 'Find out who takes the title of Top Chef!', - 'uploader': 'NBCU-BRAV', 'upload_date': '20190314', 'timestamp': 1552591860, 'season_number': 16, 'episode_number': 15, 'series': 'Top Chef', 'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.0, - } + 'duration': 190.357, + 'season': 'Season 16', + 'thumbnail': r're:^https://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, + 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', + 'info_dict': { + 'id': '9000234570', + 'ext': 'mp4', + 'title': 'London Calling', + 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', + 'upload_date': '20230310', + 'timestamp': 1678410000, + 'season_number': 20, + 'episode_number': 1, + 'series': 'Top Chef', + 'episode': 'London Calling', + 'duration': 3266.03, + 'season': 'Season 20', + 'chapters': 'count:7', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', + 'info_dict': { + 'id': '3692045', + 'ext': 'mp4', + 'title': 'Closing Night', + 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', + 'upload_date': '20180401', + 'timestamp': 1522623600, + 'season_number': 1, + 'episode_number': 1, + 'series': 'In Ice Cold Blood', + 'episode': 'Closing Night', + 'duration': 2629.051, + 'season': 'Season 1', + 'chapters': 'count:6', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', }, { 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'info_dict': { + 'id': '3974019', + 'ext': 'mp4', + 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', + 'upload_date': '20190617', + 'timestamp': 1560790800, + 'season_number': 2, + 'episode_number': 16, + 'series': 'In Ice Cold Blood', + 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'duration': 68.235, + 'season': 'Season 2', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, }] def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() + site, display_id = self._match_valid_url(url).group('site', 'id') webpage = self._download_webpage(url, display_id) - settings = self._parse_json(self._search_regex( - r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), - display_id) - info = {} + settings = self._search_json( + r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') query = { - 'mbr': 'true', + 'manifest': 'm3u', + 'formats': 'm3u,mpeg4', } - account_pid, release_pid = [None] * 2 - tve = settings.get('ls_tve') + if 
tve: - query['manifest'] = 'm3u' - mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) - if mobj: - account_pid, tp_path = mobj.groups() - release_pid = tp_path.strip('/').split('/')[-1] - else: - account_pid = 'HNK2IC' - tp_path = release_pid = tve['release_pid'] - if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('tve_adobe_auth', {}) - if site == 'bravotv': - site = 'bravo' + account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} + site = remove_end(site, 'tv') + release_pid = tve['data-release-pid'] resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId') or site, - tve['title'], release_pid, tve.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, release_pid, - adobe_pass.get('adobePassRequestorId') or site, resource) + tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, + tve['data-title'], release_pid, tve.get('data-rating')) + query.update({ + 'switch': 'HLSServiceSecure', + 'auth': self._extract_mvpd_auth( + url, release_pid, auth.get('adobePassRequestorId') or site, resource), + }) + else: - shared_playlist = settings['ls_playlist'] - account_pid = shared_playlist['account_pid'] - metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] - tp_path = release_pid = metadata.get('release_pid') - if not release_pid: - release_pid = metadata['guid'] - tp_path = 'media/guid/2140479951/' + release_pid - info.update({ - 'title': metadata['title'], - 'description': metadata.get('description'), - 'season_number': int_or_none(metadata.get('season_num')), - 'episode_number': int_or_none(metadata.get('episode_num')), - }) - query['switch'] = 'progressive' - - tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path) + ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} + account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' + account_id = ls_playlist['mpxMediaAccountId'] + video_id = ls_playlist['defaultGuid'] + metadata = traverse_obj( + ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) + tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), - display_id, fatal=False) - if tp_metadata: - info.update({ - 'title': tp_metadata.get('title'), - 'description': tp_metadata.get('description'), - 'duration': float_or_none(tp_metadata.get('duration'), 1000), - 'season_number': int_or_none( - dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))), - 'episode_number': int_or_none( - dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))), - # For some reason the series is sometimes wrapped into a single element array. 
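The quirk that comment describes — `pl1$show` sometimes arriving as a plain string and sometimes wrapped in a one-element list — is what the replacement code's `(None, ...)` branch handles further down in this patch. A minimal sketch of that `traverse_obj` path, using made-up metadata values rather than the patch's own test data:

```python
from yt_dlp.utils import traverse_obj

# 'pl1$show' is sometimes a plain string, sometimes a one-element list
wrapped = {'pl1$show': ['Top Chef']}
plain = {'nbcu$show': 'Top Chef'}

# (None, ...) branches into "the value itself" and "each item of the value",
# and {str} keeps only string results, so both shapes yield the series name
path = (('pl1$show', 'nbcu$show'), (None, ...), {str})
assert traverse_obj(wrapped, path, get_all=False) == 'Top Chef'
assert traverse_obj(plain, path, get_all=False) == 'Top Chef'
```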
- 'series': try_get( - dict_get(tp_metadata, ('pl1$show', 'nbcu$show')), - lambda x: x[0] if isinstance(x, list) else x, - expected_type=str), - 'episode': dict_get( - tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')), - }) + update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'id': release_pid, - 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}), - 'ie_key': 'ThePlatform', - }) - return info + seconds_or_none = lambda x: float_or_none(x, 1000) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {seconds_or_none}), + 'end_time': ('endTime', {seconds_or_none}), + })) + # prune pointless single chapters that span the entire duration from short videos + if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): + chapters = None + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + update_url_query(f'{tp_url}/stream.m3u8', query), video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + **merge_dicts(traverse_obj(tp_metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {seconds_or_none}), + 'timestamp': ('pubDate', {seconds_or_none}), + 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), + 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), + 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), + 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), + }, get_all=False), traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {unified_timestamp}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': 'episodeTitle', + 'series': 'show', + })) + } From c2e0fc40a73dd85ab3920f977f579d475e66ef59 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 21 Mar 2023 18:12:17 -0500 Subject: [PATCH 003/501] [extractor/generic] Add extractor-args `hls_key`, `variant_query` (#6567) Authored by: bashonly --- README.md | 2 ++ yt_dlp/extractor/generic.py | 32 +++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index de83e421fb..9ce85d6319 100644 --- a/README.md +++ b/README.md @@ -1798,6 +1798,8 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) #### generic * `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg +* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs +* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist #### funimation * `language`: Audio languages to extract, e.g. 
`funimation:language=english,japanese` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 49aa5a1f5c..075bb36ded 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -24,6 +24,7 @@ mimetype2ext, orderedSet, parse_duration, + parse_qs, parse_resolution, smuggle_url, str_or_none, @@ -32,6 +33,7 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, urljoin, variadic, @@ -2184,12 +2186,21 @@ def report_detected(self, name, num=1, note=None): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') - def _fragment_query(self, url): + def _extra_manifest_info(self, info, manifest_url): if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(url).query + query_string = urllib.parse.urlparse(manifest_url).query if query_string: - return {'extra_param_to_segment_url': query_string} - return {} + info['extra_param_to_segment_url'] = query_string + + hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key'), { + 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), + }) or None + + if self._configuration_arg('variant_query'): + query = parse_qs(manifest_url) + for fmt in self._downloader._get_formats(info): + fmt['url'] = update_url_query(fmt['url'], query) def _extract_rss(self, url, video_id, doc): NS_MAP = { @@ -2397,10 +2408,8 @@ def _real_extract(self, url): subtitles = {} if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd': formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) - info_dict.update(self._fragment_query(url)) elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: @@ -2415,6 +2424,7 @@ def _real_extract(self, url): 'subtitles': subtitles, 'http_headers': headers or None, }) + self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2427,7 +2437,7 @@ def _real_extract(self, url): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? 
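A hedged usage sketch for the `generic` extractor-args introduced in this patch, passed through yt-dlp's embedding API; the target URL is a placeholder and the key/IV pair is just the dummy example from the README hunk above — none of it comes from the patch's own code:

```python
import yt_dlp

# CLI equivalent of the options below:
#   yt-dlp --extractor-args 'generic:fragment_query' \
#          --extractor-args 'generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321' <URL>
ydl_opts = {
    'extractor_args': {
        'generic': {
            'fragment_query': [''],  # bare flag: reuse the manifest URL's own query string
            'hls_key': ['ABCDEF1234567980', '0xFEDCBA0987654321'],  # (URI|KEY)[,IV] form
        },
    },
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/master.m3u8'])  # placeholder URL
```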
@@ -2478,7 +2488,7 @@ def _real_extract(self, url): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2592,7 +2602,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) for fmt in formats: - fmt.update(self._fragment_query(src)) + self._extra_manifest_info(fmt, src) if not formats: formats.append({ @@ -2795,10 +2805,10 @@ def filter_video(urls): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: From 44369c9afa996e14e9f466754481d878811b5b4a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:18:42 -0500 Subject: [PATCH 004/501] [extractor/cbs] Add `ParamountPressExpress` extractor (#6604) Closes #6597 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/brightcove.py | 6 +- yt_dlp/extractor/cbs.py | 113 ++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 01281b5a15..6c948e5fce 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -298,7 +298,10 @@ CBCGemPlaylistIE, CBCGemLiveIE, ) -from .cbs import CBSIE +from .cbs import ( + CBSIE, + ParamountPressExpressIE, +) from .cbslocal import ( CBSLocalIE, CBSLocalArticleIE, diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2b7ddcae8d..cd0e8ff275 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -575,6 +575,7 @@ def build_format_id(kind): self.raise_no_formats( error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + headers.pop('Authorization', None) # or else http formats will give error 400 for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -895,8 +896,9 @@ def extract_policy_key(): store_pk(policy_key) return policy_key - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} + token = smuggled_data.get('token') + api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}' + headers = {'Authorization': f'Bearer {token}'} if token else {} referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key if referrer: headers.update({ diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 9aacd50c45..1c0dbdea94 100644 --- a/yt_dlp/extractor/cbs.py +++ 
b/yt_dlp/extractor/cbs.py @@ -1,8 +1,14 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor from .theplatform import ThePlatformFeedIE +from .youtube import YoutubeIE from ..utils import ( ExtractorError, + extract_attributes, + get_element_html_by_id, int_or_none, find_xpath_attr, + smuggle_url, xpath_element, xpath_text, update_url_query, @@ -162,3 +168,110 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')), }) + + +class ParamountPressExpressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?Pyt-)?video/?\?watch=(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx', + 'md5': '56631dbcadaab980d1fc47cb7b76cba4', + 'info_dict': { + 'id': '6322981580112', + 'ext': 'mp4', + 'title': 'I’m Felicia', + 'description': 'md5:88fad93f8eede1c9c8f390239e4c6290', + 'uploader_id': '6055873637001', + 'upload_date': '20230320', + 'timestamp': 1679334960, + 'duration': 49.557, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc', + 'md5': 'edcb03e3210b88a3e56c05aa863e0e5b', + 'info_dict': { + 'id': '6323036027112', + 'ext': 'mp4', + 'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More', + 'description': 'md5:b929867a357aac5544b783d834c78383', + 'uploader_id': '6055873637001', + 'upload_date': '20230321', + 'timestamp': 1679430180, + 'duration': 132.032, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck', + 'info_dict': { + 'id': 'OX9wJWOcqck', + 'ext': 'mp4', + 'title': 'Rugrats | Season 2 Official Trailer | Paramount+', + 'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de', + 'uploader': 'Paramount Plus', + 'uploader_id': '@paramountplus', + 'uploader_url': 'http://www.youtube.com/@paramountplus', + 'channel': 'Paramount Plus', + 'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg', + 'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg', + 'upload_date': '20230316', + 'duration': 88, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': ['Rugrats'], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw', + 'info_dict': { + 'id': '_ljssSoDLkw', + 'ext': 'mp4', + 'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME', + 'description': 'md5:39581bcc3fd810209b642609f448af70', + 'uploader': 'SHOWTIME', + 'uploader_id': '@Showtime', + 'uploader_url': 'http://www.youtube.com/@Showtime', + 'channel': 'SHOWTIME', + 'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ', + 'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ', + 'upload_date': '20230209', + 'duration': 49, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 
'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp', + 'categories': ['People & Blogs'], + 'tags': 'count:27', + }, + }] + + def _real_extract(self, url): + display_id, is_youtube = self._match_valid_url(url).group('id', 'yt') + if is_youtube: + return self.url_result(display_id, YoutubeIE) + + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID') + token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token') + + player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '') + account_id = player.get('data-account') or '6055873637001' + player_id = player.get('data-player') or 'OtLKgXlO9F' + embed = player.get('data-embed') or 'default' + + return self.url_result(smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}', + {'token': token}), BrightcoveNewIE) From 69b2f838d3d3e37dc17367ef64d978db1bea45cf Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:19:37 -0500 Subject: [PATCH 005/501] [extractor/telecaribe] Expand livestream support (#6601) Closes #6598 Authored by: bashonly --- yt_dlp/extractor/telecaribe.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/telecaribe.py b/yt_dlp/extractor/telecaribe.py index b6d88a8090..91118a1a4a 100644 --- a/yt_dlp/extractor/telecaribe.py +++ b/yt_dlp/extractor/telecaribe.py @@ -38,11 +38,23 @@ class TelecaribePlayIE(InfoExtractor): 'params': { 'skip_download': 'Livestream', } + }, { + 'url': 'https://www.play.telecaribe.co/liveplus', + 'info_dict': { + 'id': 'liveplus', + 'title': r're:^Señal en vivo Plus', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + 'skip': 'Geo-restricted to Colombia', }] def _download_player_webpage(self, webpage, display_id): page_id = self._search_regex( - (r'window.firstPageId\s*=\s*["\']([^"\']+)', r']+id\s*=\s*"pageBackground_([^"]+)'), + (r'window\.firstPageId\s*=\s*["\']([^"\']+)', r']+id\s*=\s*"pageBackground_([^"]+)'), webpage, 'page_id') props = self._download_json(self._search_regex( @@ -59,14 +71,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) player = self._download_player_webpage(webpage, display_id) - if display_id != 'live': + livestream_url = self._search_regex( + r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url', default=None) + + if not livestream_url: return self.playlist_from_matches( re.findall(r']+href\s*=\s*"([^"]+\.mp4)', player), display_id, self._get_clean_title(self._og_search_title(webpage))) formats, subtitles = self._extract_m3u8_formats_and_subtitles( - self._search_regex(r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url'), - display_id, 'mp4') + livestream_url, display_id, 'mp4', live=True) return { 'id': display_id, From 78bc1868ff3352108ab2911033d1ac67a55f151e Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 23 Mar 2023 15:16:02 +0530 Subject: [PATCH 006/501] [extractor/rumble] Detect timeline format Closes #6607 --- yt_dlp/extractor/rumble.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 97f81446c7..834fe704f3 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -7,6 +7,7 @@ ExtractorError, 
UnsupportedError, clean_html, + determine_ext, get_element_by_class, int_or_none, parse_count, @@ -175,12 +176,16 @@ def _real_extract(self, url): video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue + timeline = ext == 'timeline' + if timeline: + ext = determine_ext(video_info['url']) formats.append({ 'ext': ext, + 'acodec': 'none' if timeline else None, 'url': video_info['url'], 'format_id': '%s-%sp' % (ext, height), - 'height': int_or_none(height), - 'fps': video.get('fps'), + 'format_note': 'Timeline' if timeline else None, + 'fps': None if timeline else video.get('fps'), **traverse_obj(meta, { 'tbr': 'bitrate', 'filesize': 'size', @@ -247,6 +252,25 @@ class RumbleIE(InfoExtractor): }, { 'url': 'http://www.rumble.com/vDMUM1?key=value', 'only_matching': True, + }, { + 'note': 'timeline format', + 'url': 'https://rumble.com/v2ea9qb-the-u.s.-cannot-hide-this-in-ukraine-anymore-redacted-with-natali-and-clayt.html', + 'md5': '40d61fec6c0945bca3d0e1dc1aa53d79', + 'params': {'format': 'wv'}, + 'info_dict': { + 'id': 'v2bou5f', + 'ext': 'mp4', + 'uploader': 'Redacted News', + 'upload_date': '20230322', + 'timestamp': 1679445010, + 'title': 'The U.S. CANNOT hide this in Ukraine anymore | Redacted with Natali and Clayton Morris', + 'duration': 892, + 'channel': 'Redacted News', + 'description': 'md5:aaad0c5c3426d7a361c29bdaaced7c42', + 'channel_url': 'https://rumble.com/c/Redacted', + 'live_status': 'not_live', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg', + }, }] _WEBPAGE_TESTS = [{ From 6994afc030d2a786d8032075ed71a14d7eac5a4f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 23 Mar 2023 19:09:29 +0530 Subject: [PATCH 007/501] [extractor/rumble] Fix videos without quality selection Closes #6612 --- yt_dlp/extractor/rumble.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 834fe704f3..98f660f8b6 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -8,8 +8,10 @@ UnsupportedError, clean_html, determine_ext, + format_field, get_element_by_class, int_or_none, + join_nonempty, parse_count, parse_iso8601, traverse_obj, @@ -165,7 +167,13 @@ def _real_extract(self, url): formats = [] for ext, ext_info in (video.get('ua') or {}).items(): - for height, video_info in (ext_info or {}).items(): + if isinstance(ext_info, dict): + for height, video_info in ext_info.items(): + if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): + video_info.setdefault('meta', {})['h'] = height + ext_info = ext_info.values() + + for video_info in ext_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue @@ -183,7 +191,7 @@ def _real_extract(self, url): 'ext': ext, 'acodec': 'none' if timeline else None, 'url': video_info['url'], - 'format_id': '%s-%sp' % (ext, height), + 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), 'format_note': 'Timeline' if timeline else None, 'fps': None if timeline else video.get('fps'), **traverse_obj(meta, { @@ -271,6 +279,24 @@ class RumbleIE(InfoExtractor): 'live_status': 'not_live', 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg', }, + }, { + 'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html', + 'info_dict': { + 'id': 'v2blzyy', + 'ext': 'mp4', + 'live_status': 'was_live', + 
'release_timestamp': 1679446804, + 'description': 'md5:2ac4908ccfecfb921f8ffa4b30c1e636', + 'release_date': '20230322', + 'timestamp': 1679445692, + 'duration': 4435, + 'upload_date': '20230322', + 'title': 'The Covid Twitter Files Drop: Protecting Fauci While Censoring The Truth w/Matt Taibbi', + 'uploader': 'Kim Iversen', + 'channel_url': 'https://rumble.com/c/KimIversen', + 'channel': 'Kim Iversen', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg', + }, }] _WEBPAGE_TESTS = [{ From 5cc0a8fd2e9fec50026fb92170b57993af939e4a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:28:23 -0500 Subject: [PATCH 008/501] [extractor/generic] Accept values for `fragment_query`, `variant_query` (#6600) Closes #6593 Authored by: bashonly --- README.md | 4 ++-- yt_dlp/extractor/generic.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9ce85d6319..3e8484314f 100644 --- a/README.md +++ b/README.md @@ -1797,8 +1797,8 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off #### generic -* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg -* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg +* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE` * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. 
Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist #### funimation diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 075bb36ded..f9fa01feb8 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -24,7 +24,6 @@ mimetype2ext, orderedSet, parse_duration, - parse_qs, parse_resolution, smuggle_url, str_or_none, @@ -2187,18 +2186,23 @@ def report_detected(self, name, num=1, note=None): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') def _extra_manifest_info(self, info, manifest_url): - if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(manifest_url).query - if query_string: - info['extra_param_to_segment_url'] = query_string + fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0] + if fragment_query is not None: + fragment_query = self._configuration_arg('fragment_query', casesense=True)[0] + info['extra_param_to_segment_url'] = ( + urllib.parse.urlparse(fragment_query).query or fragment_query + or urllib.parse.urlparse(manifest_url).query or None) hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None - info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key'), { + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), }) or None - if self._configuration_arg('variant_query'): - query = parse_qs(manifest_url) + variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0] + if variant_query is not None: + query = urllib.parse.parse_qs( + urllib.parse.urlparse(variant_query).query or variant_query + or urllib.parse.urlparse(manifest_url).query) for fmt in self._downloader._get_formats(info): fmt['url'] = update_url_query(fmt['url'], query) From 3ae182ad89e1427ff7b1684d6a44ff93fa857a0c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:45:27 -0500 Subject: [PATCH 009/501] [extractor/pgatour] Add extractor (#6613) Closes #6537 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/pgatour.py | 47 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 yt_dlp/extractor/pgatour.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6c948e5fce..4a4d38cafb 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1393,6 +1393,7 @@ PeriscopeIE, PeriscopeUserIE, ) +from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/yt_dlp/extractor/pgatour.py b/yt_dlp/extractor/pgatour.py new file mode 100644 index 0000000000..36c2c6207d --- /dev/null +++ b/yt_dlp/extractor/pgatour.py @@ -0,0 +1,47 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + + +class PGATourIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?PT)?(?P\d+)' + _TESTS = [{ + 'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1', + 'info_dict': { + 'id': '6322447785112', + 'ext': 'mp4', + 'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1', + 'uploader_id': '6116716431001', + 'upload_date': '20230312', + 
'timestamp': 1678653136, + 'duration': 20.011, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:7', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday', + 'info_dict': { + 'id': '6322506425112', + 'ext': 'mp4', + 'title': 'Follow THE PLAYERS trophy on Championship Sunday', + 'description': 'md5:4d29e4bdfa03694a0ebfd08950398568', + 'uploader_id': '6082840763001', + 'upload_date': '20230313', + 'timestamp': 1678739835, + 'duration': 123.435, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:8', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, is_tourcast = self._match_valid_url(url).group('id', 'tc') + + # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js + account_id = '6116716431001' if is_tourcast else '6082840763001' + player_id = 'Vsd5Umu8r' if is_tourcast else 'FWIBYMBPj' + + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE) From 6bdb64e2a2a6d504d8ce1dc830fbfb8a7f199c63 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:45:56 -0500 Subject: [PATCH 010/501] [extractor/hollywoodreporter] Add extractors (#6614) Closes #6525 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/hollywoodreporter.py | 72 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 yt_dlp/extractor/hollywoodreporter.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4a4d38cafb..69464b6f00 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -719,6 +719,10 @@ from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE +from .hollywoodreporter import ( + HollywoodReporterIE, + HollywoodReporterPlaylistIE, +) from .holodex import HolodexIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( diff --git a/yt_dlp/extractor/hollywoodreporter.py b/yt_dlp/extractor/hollywoodreporter.py new file mode 100644 index 0000000000..1f7eb89bc9 --- /dev/null +++ b/yt_dlp/extractor/hollywoodreporter.py @@ -0,0 +1,72 @@ +import functools +import re + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from ..utils import ( + ExtractorError, + OnDemandPagedList, + extract_attributes, + get_element_by_class, + get_element_html_by_class, +) + + +class HollywoodReporterIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/', + 'info_dict': { + 'id': 'zH4jZaR5', + 'ext': 'mp4', + 'title': 'md5:a9a1c073770a32f178955997712c4bd9', + 'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720', + 'upload_date': '20230312', + 'timestamp': 1678586423, + 'duration': 242.0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + data = extract_attributes(get_element_html_by_class('vlanding-video-card__link', webpage) or '') + video_id 
= data['data-video-showcase-trigger'] + showcase_type = data['data-video-showcase-type'] + + if showcase_type == 'jwplayer': + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE) + elif showcase_type == 'youtube': + return self.url_result(video_id, 'Youtube') + else: + raise ExtractorError(f'Unsupported showcase type "{showcase_type}"') + + +class HollywoodReporterPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P[\w-]+)-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/', + 'playlist_mincount': 109, + 'info_dict': { + 'id': '57822', + 'title': 'heat-vision-breakdown', + } + }] + + def _fetch_page(self, slug, pl_id, page): + page += 1 + webpage = self._download_webpage( + f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page}/', + pl_id, note=f'Downloading playlist page {page}') + section = get_element_by_class('video-playlist-river', webpage) or '' + + for url in re.findall(r']+href="([^"]+)"[^>]+class="c-title__link', section): + yield self.url_result(url, HollywoodReporterIE) + + def _real_extract(self, url): + slug, pl_id = self._match_valid_url(url).group('slug', 'id') + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, slug, pl_id), 15), pl_id, slug) From 8ceb07e870424c219dced8f4348729553f05c5cc Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:46:33 -0500 Subject: [PATCH 011/501] [extractor/tiktok] Fix mp3 formats (#6615) Closes #6608 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index f1696a2fcb..fb838d5298 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -13,6 +13,7 @@ LazyList, UnsupportedError, UserNotLive, + determine_ext, format_field, get_element_by_id, get_first, @@ -204,6 +205,16 @@ def parse_url_key(url_key): known_resolutions = {} + def mp3_meta(url): + return { + 'format_note': 'Music track', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + 'width': None, + 'height': None, + } if determine_ext(url) == 'mp3' else {} + def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: @@ -219,7 +230,8 @@ def extract_addr(addr, add_meta={}): 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, 'format_note': join_nonempty( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ') + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '), + **mp3_meta(url), } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -553,6 +565,28 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, }, 'skip': 'This video is unavailable', + }, { + # slideshow audio-only mp3 format + 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283', + 'info_dict': { + 'id': '7139980461132074283', + 'ext': 'mp3', + 'title': 'TikTok video #7139980461132074283', + 'description': '', + 'creator': 'Antaura', + 'uploader': '_le_cannibale_', + 'uploader_id': '6604511138619654149', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', + 'artist': 'nathan !', + 'track': 'grahamscott 
canon', + 'upload_date': '20220905', + 'timestamp': 1662406249, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', From 9bfe0d15bd7dbdc6b0e6378fa9f5e2e289b2373b Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 23 Mar 2023 14:28:31 -0500 Subject: [PATCH 012/501] Fix 5cc0a8fd2e9fec50026fb92170b57993af939e4a Authored by: bashonly --- yt_dlp/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f9fa01feb8..75355aeb5b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2188,7 +2188,6 @@ def report_detected(self, name, num=1, note=None): def _extra_manifest_info(self, info, manifest_url): fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0] if fragment_query is not None: - fragment_query = self._configuration_arg('fragment_query', casesense=True)[0] info['extra_param_to_segment_url'] = ( urllib.parse.urlparse(fragment_query).query or fragment_query or urllib.parse.urlparse(manifest_url).query or None) From baa922b5c74b10e3b86ff5e6cf6529b3aae8efab Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 24 Mar 2023 21:53:45 +0530 Subject: [PATCH 013/501] [extractor] Do not exit early for unsuitable `url_result` --- yt_dlp/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 2091df7faf..5da12725ae 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3513,8 +3513,8 @@ def _RETURN_TYPE(cls): @classmethod def is_single_video(cls, url): """Returns whether the URL is of a single video, None if unknown""" - assert cls.suitable(url), 'The URL must be suitable for the extractor' - return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + if cls.suitable(url): + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) @classmethod def is_suitable(cls, age_limit): From f68434cc74cfd3db01b266476a2eac8329fbb267 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 24 Mar 2023 21:53:06 +0530 Subject: [PATCH 014/501] [extractor] Extract more metadata from ISM Fixes https://github.com/yt-dlp/yt-dlp/commit/81b6102d2099eec78a2db9ae3d101a8503dd4f25#r105892531 --- test/test_InfoExtractor.py | 138 +++++++++++++------------------------ yt_dlp/extractor/common.py | 2 + 2 files changed, 49 insertions(+), 91 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index e8d94a6ac2..1f60abfd25 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1406,6 +1406,7 @@ def test_parse_ism_formats(self): 'vcodec': 'none', 'acodec': 'AACL', 'protocol': 'ism', + 'audio_channels': 2, '_download_params': { 'stream_type': 'audio', 'duration': 8880746666, @@ -1419,9 +1420,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 128, }, { 'format_id': 'video-100', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1445,9 +1443,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 100, }, { 'format_id': 'video-326', 'url': 
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1471,9 +1466,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 326, }, { 'format_id': 'video-698', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1497,9 +1489,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 698, }, { 'format_id': 'video-1493', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1523,9 +1512,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 1493, }, { 'format_id': 'video-4482', 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest', @@ -1549,9 +1535,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 4482, }], { 'eng': [ @@ -1575,34 +1558,6 @@ def test_parse_ism_formats(self): 'ec-3_test', 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', [{ - 'format_id': 'audio_deu_1-224', - 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', - 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', - 'ext': 'isma', - 'tbr': 224, - 'asr': 48000, - 'vcodec': 'none', - 'acodec': 'EC-3', - 'protocol': 'ism', - '_download_params': - { - 'stream_type': 'audio', - 'duration': 370000000, - 'timescale': 10000000, - 'width': 0, - 'height': 0, - 'fourcc': 'EC-3', - 'language': 'deu', - 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00', - 'sampling_rate': 48000, - 'channels': 6, - 'bits_per_sample': 16, - 'nal_unit_length_field': 4 - }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 224, - }, { 'format_id': 'audio_deu-127', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1612,8 +1567,9 @@ def test_parse_ism_formats(self): 'vcodec': 'none', 'acodec': 'AACL', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + 'audio_channels': 2, + '_download_params': { 'stream_type': 'audio', 'duration': 370000000, 'timescale': 10000000, @@ -1627,9 +1583,32 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'audio_ext': 'isma', - 'video_ext': 'none', - 'abr': 127, + }, { + 'format_id': 'audio_deu_1-224', + 'url': 
'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'isma', + 'tbr': 224, + 'asr': 48000, + 'vcodec': 'none', + 'acodec': 'EC-3', + 'protocol': 'ism', + 'language': 'deu', + 'audio_channels': 6, + '_download_params': { + 'stream_type': 'audio', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 0, + 'height': 0, + 'fourcc': 'EC-3', + 'language': 'deu', + 'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00', + 'sampling_rate': 48000, + 'channels': 6, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, }, { 'format_id': 'video_deu-23', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1641,8 +1620,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1655,9 +1634,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 23, }, { 'format_id': 'video_deu-403', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1669,8 +1645,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1683,9 +1659,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 403, }, { 'format_id': 'video_deu-680', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1697,8 +1670,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1711,9 +1684,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 680, }, { 'format_id': 'video_deu-1253', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1725,8 +1695,9 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'vbr': 1253, + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1739,9 +1710,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 1253, }, { 'format_id': 
'video_deu-2121', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1753,8 +1721,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1767,9 +1735,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 2121, }, { 'format_id': 'video_deu-3275', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1781,8 +1746,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1795,9 +1760,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 3275, }, { 'format_id': 'video_deu-5300', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1809,8 +1771,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1823,9 +1785,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 5300, }, { 'format_id': 'video_deu-8079', 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', @@ -1837,8 +1796,8 @@ def test_parse_ism_formats(self): 'vcodec': 'AVC1', 'acodec': 'none', 'protocol': 'ism', - '_download_params': - { + 'language': 'deu', + '_download_params': { 'stream_type': 'video', 'duration': 370000000, 'timescale': 10000000, @@ -1851,9 +1810,6 @@ def test_parse_ism_formats(self): 'bits_per_sample': 16, 'nal_unit_length_field': 4 }, - 'video_ext': 'ismv', - 'audio_ext': 'none', - 'vbr': 8079, }], {}, ), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 5da12725ae..838899052c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2983,6 +2983,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, From 0898c5c8ccadfc404472456a7a7751b72afebadd Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 25 Mar 2023 19:41:28 +0100 Subject: [PATCH 015/501] [utils] `js_to_json`: Implement template strings (#6623) Authored by: Grub4K --- test/test_utils.py | 7 +++++++ yt_dlp/utils.py | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 
From 0898c5c8ccadfc404472456a7a7751b72afebadd Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Sat, 25 Mar 2023 19:41:28 +0100
Subject: [PATCH 015/501] [utils] `js_to_json`: Implement template strings (#6623)

Authored by: Grub4K
---
 test/test_utils.py |  7 +++++++
 yt_dlp/utils.py    | 11 +++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 3045b6d7e1..d4a301583f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1190,6 +1190,13 @@ def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')
 
+    def test_js_to_json_template_literal(self):
+        self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
+        self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
+        self.assertEqual(js_to_json('`${name}`', {}), '"name"')
+
     def test_extract_attributes(self):
         self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
         self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 8c2c5593cc..40533c2cb4 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3366,7 +3366,7 @@ def strip_jsonp(code):
 
 def js_to_json(code, vars={}, *, strict=False):
     # vars is a dict of var, val pairs to substitute
-    STRING_QUOTES = '\'"'
+    STRING_QUOTES = '\'"`'
     STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
@@ -3384,6 +3384,12 @@ def process_escape(match):
             else '' if escape == '\n'
             else escape)
 
+    def template_substitute(match):
+        evaluated = js_to_json(match.group(1), vars, strict=strict)
+        if evaluated[0] == '"':
+            return json.loads(evaluated)
+        return evaluated
+
     def fix_kv(m):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
@@ -3394,7 +3400,8 @@ def fix_kv(m):
             return ''
 
         if v[0] in STRING_QUOTES:
-            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
+            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
+            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
             return f'"{escaped}"'
 
     for regex, base in INTEGER_TABLE:

From 33b737bedf8383c0d00d4e1d06a5273dcdfdb756 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sun, 26 Mar 2023 17:16:42 -0500
Subject: [PATCH 016/501] [extractor/triller] Support short URLs, detect removed videos (#6636)

Authored by: bashonly
---
 yt_dlp/extractor/_extractors.py |   1 +
 yt_dlp/extractor/triller.py     | 307 ++++++++++++++++++--------------
 2 files changed, 174 insertions(+), 134 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 69464b6f00..a97c458fa6 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1962,6 +1962,7 @@
 from .triller import (
     TrillerIE,
     TrillerUserIE,
+    TrillerShortIE,
 )
 from .trilulilu import TriluliluIE
 from .trovo import (
diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py
index acd9e68d25..6a4dadb9bd 100644
--- a/yt_dlp/extractor/triller.py
+++ b/yt_dlp/extractor/triller.py
@@ -1,15 +1,21 @@
 import itertools
 import json
+import re
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    HEADRequest,
+    UnsupportedError,
+    determine_ext,
     int_or_none,
+    parse_resolution,
     str_or_none,
     traverse_obj,
-    unified_strdate,
     unified_timestamp,
     url_basename,
+    urljoin,
+    url_or_none,
 )
 
@@ -22,25 +28,22 @@ def _perform_login(self, username, password):
         if self._API_HEADERS.get('Authorization'):
             return
 
-        user_check = self._download_json(
+        headers = {**self._API_HEADERS, 'Content-Type': 'application/json'}
+        user_check = traverse_obj(self._download_json(
             f'{self._API_BASE_URL}/api/user/is-valid-username', 
None, note='Checking username', - fatal=False, expected_status=400, headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://triller.co', - }, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8')) - if user_check.get('status'): # endpoint returns "status":false if username exists + fatal=False, expected_status=400, headers=headers, + data=json.dumps({'username': username}, separators=(',', ':')).encode()), 'status') + + if user_check: # endpoint returns `"status":false` if username exists raise ExtractorError('Unable to login: Invalid username', expected=True) - credentials = { - 'username': username, - 'password': password, - } login = self._download_json( - f'{self._API_BASE_URL}/user/auth', None, note='Logging in', - fatal=False, expected_status=400, headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://triller.co', - }, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8')) + f'{self._API_BASE_URL}/user/auth', None, note='Logging in', fatal=False, + expected_status=400, headers=headers, data=json.dumps({ + 'username': username, + 'password': password, + }, separators=(',', ':')).encode()) or {} + if not login.get('auth_token'): if login.get('error') == 1008: raise ExtractorError('Unable to login: Incorrect password', expected=True) @@ -55,100 +58,100 @@ def _get_comments(self, video_id, limit=15): headers=self._API_HEADERS, query={'limit': limit}) or {} if not comment_info.get('comments'): return - for comment_dict in comment_info['comments']: - yield { - 'author': traverse_obj(comment_dict, ('author', 'username')), - 'author_id': traverse_obj(comment_dict, ('author', 'user_id')), - 'id': comment_dict.get('id'), - 'text': comment_dict.get('body'), - 'timestamp': unified_timestamp(comment_dict.get('timestamp')), - } + yield from traverse_obj(comment_info, ('comments', ..., { + 'id': ('id', {str_or_none}), + 'text': 'body', + 'author': ('author', 'username'), + 'author_id': ('author', 'user_id'), + 'timestamp': ('timestamp', {unified_timestamp}), + })) def _check_user_info(self, user_info): - if not user_info: - self.report_warning('Unable to extract user info') - elif user_info.get('private') and not user_info.get('followed_by_me'): + if user_info.get('private') and not user_info.get('followed_by_me'): raise ExtractorError('This video is private', expected=True) elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'): raise ExtractorError('The author of the video is blocked', expected=True) return user_info - def _parse_video_info(self, video_info, username, user_info=None): - video_uuid = video_info.get('video_uuid') - video_id = video_info.get('id') + def _parse_video_info(self, video_info, username, user_id, display_id=None): + video_id = str(video_info['id']) + display_id = display_id or video_info.get('video_uuid') + + if traverse_obj(video_info, ( + None, ('transcoded_url', 'video_url', 'stream_url', 'audio_url'), + {lambda x: re.search(r'/copyright/', x)}), get_all=False): + self.raise_no_formats('This video has been removed due to licensing restrictions', expected=True) + + def format_info(url): + return { + 'url': url, + 'ext': determine_ext(url), + 'format_id': url_basename(url).split('.')[0], + } formats = [] - video_url = traverse_obj(video_info, 'video_url', 'stream_url') - if video_url: + + if determine_ext(video_info.get('transcoded_url')) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_info['transcoded_url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + + for video in 
traverse_obj(video_info, ('video_set', lambda _, v: url_or_none(v['url']))): formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'vcodec': 'h264', - 'width': video_info.get('width'), - 'height': video_info.get('height'), - 'format_id': url_basename(video_url).split('.')[0], - 'filesize': video_info.get('filesize'), - }) - video_set = video_info.get('video_set') or [] - for video in video_set: - resolution = video.get('resolution') or '' - formats.append({ - 'url': video['url'], - 'ext': 'mp4', + **format_info(video['url']), + **parse_resolution(video.get('resolution')), 'vcodec': video.get('codec'), 'vbr': int_or_none(video.get('bitrate'), 1000), - 'width': int_or_none(resolution.split('x')[0]), - 'height': int_or_none(resolution.split('x')[1]), - 'format_id': url_basename(video['url']).split('.')[0], }) - audio_url = video_info.get('audio_url') - if audio_url: + + video_url = traverse_obj(video_info, 'video_url', 'stream_url', expected_type=url_or_none) + if video_url: formats.append({ - 'url': audio_url, - 'ext': 'm4a', - 'format_id': url_basename(audio_url).split('.')[0], + **format_info(video_url), + 'vcodec': 'h264', + **traverse_obj(video_info, { + 'width': 'width', + 'height': 'height', + 'filesize': 'filesize', + }, expected_type=int_or_none), }) - manifest_url = video_info.get('transcoded_url') - if manifest_url: - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + audio_url = url_or_none(video_info.get('audio_url')) + if audio_url: + formats.append(format_info(audio_url)) - comment_count = int_or_none(video_info.get('comment_count')) - - user_info = user_info or traverse_obj(video_info, 'user', default={}) + comment_count = traverse_obj(video_info, ('comment_count', {int_or_none})) return { - 'id': str_or_none(video_id) or video_uuid, - 'title': video_info.get('description') or f'Video by {username}', - 'thumbnail': video_info.get('thumbnail_url'), - 'description': video_info.get('description'), - 'uploader': str_or_none(username), - 'uploader_id': str_or_none(user_info.get('user_id')), - 'creator': str_or_none(user_info.get('name')), - 'timestamp': unified_timestamp(video_info.get('timestamp')), - 'upload_date': unified_strdate(video_info.get('timestamp')), - 'duration': int_or_none(video_info.get('duration')), - 'view_count': int_or_none(video_info.get('play_count')), - 'like_count': int_or_none(video_info.get('likes_count')), - 'artist': str_or_none(video_info.get('song_artist')), - 'track': str_or_none(video_info.get('song_title')), - 'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}', + 'id': video_id, + 'display_id': display_id, + 'uploader': username, + 'uploader_id': user_id or traverse_obj(video_info, ('user', 'user_id', {str_or_none})), + 'webpage_url': urljoin(f'https://triller.co/@{username}/video/', display_id), 'uploader_url': f'https://triller.co/@{username}', 'extractor_key': TrillerIE.ie_key(), 'extractor': TrillerIE.IE_NAME, 'formats': formats, 'comment_count': comment_count, '__post_extractor': self.extract_comments(video_id, comment_count), + **traverse_obj(video_info, { + 'title': ('description', {lambda x: x.replace('\r\n', ' ')}), + 'description': 'description', + 'creator': ((('user'), ('users', lambda _, v: str(v['user_id']) == user_id)), 'name'), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'timestamp': ('timestamp', {unified_timestamp}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'like_count': 
('likes_count', {int_or_none}), + 'artist': 'song_artist', + 'track': 'song_title', + }, get_all=False), } class TrillerIE(TrillerBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?triller\.co/ - @(?P[\w\._]+)/video/ - (?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + @(?P[\w.]+)/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}) ''' _TESTS = [{ 'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', @@ -165,16 +168,14 @@ class TrillerIE(TrillerBaseIE): 'timestamp': 1660598222, 'upload_date': '20220815', 'duration': 47, - 'height': 3840, - 'width': 2160, 'view_count': int, 'like_count': int, 'artist': 'Megan Thee Stallion', 'track': 'Her', - 'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', 'uploader_url': 'https://triller.co/@theestallion', 'comment_count': int, - } + }, + 'skip': 'This video has been removed due to licensing restrictions', }, { 'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'md5': '874055f462af5b0699b9dbb527a505a0', @@ -182,6 +183,7 @@ class TrillerIE(TrillerBaseIE): 'id': '71621339', 'ext': 'mp4', 'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'display_id': '46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', 'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', 'uploader': 'charlidamelio', @@ -190,59 +192,75 @@ class TrillerIE(TrillerBaseIE): 'timestamp': 1660773354, 'upload_date': '20220817', 'duration': 16, - 'height': 1920, - 'width': 1080, 'view_count': int, 'like_count': int, 'artist': 'Dixie', 'track': 'Someone to Blame', - 'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'uploader_url': 'https://triller.co/@charlidamelio', 'comment_count': int, - } + }, + }, { + 'url': 'https://triller.co/@theestallion/video/07f35f38-1f51-48e2-8c5f-f7a8e829988f', + 'md5': 'af7b3553e4b8bfca507636471ee2eb41', + 'info_dict': { + 'id': '71837829', + 'ext': 'mp4', + 'title': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio #womeninhiphop', + 'display_id': '07f35f38-1f51-48e2-8c5f-f7a8e829988f', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio\r\n #womeninhiphop', + 'uploader': 'theestallion', + 'uploader_id': '18992236', + 'creator': 'Megan Thee Stallion', + 'timestamp': 1662486178, + 'upload_date': '20220906', + 'duration': 30, + 'view_count': int, + 'like_count': int, + 'artist': 'Unknown', + 'track': 'Unknown', + 'uploader_url': 'https://triller.co/@theestallion', + 'comment_count': int, + }, }] def _real_extract(self, url): - username, video_uuid = self._match_valid_url(url).group('username', 'id') + username, display_id = self._match_valid_url(url).group('username', 'id') - video_info = traverse_obj(self._download_json( - f'{self._API_BASE_URL}/api/videos/{video_uuid}', - video_uuid, note='Downloading video info API JSON', - errnote='Unable to download video info API JSON', - headers=self._API_HEADERS), ('videos', 0)) - if not video_info: - raise ExtractorError('No video info found in API response') + video_info = self._download_json( + f'{self._API_BASE_URL}/api/videos/{display_id}', display_id, + headers=self._API_HEADERS)['videos'][0] - user_info = self._check_user_info(video_info.get('user') or {}) - return self._parse_video_info(video_info, username, user_info) + self._check_user_info(video_info.get('user') or {}) + + return self._parse_video_info(video_info, username, 
None, display_id) class TrillerUserIE(TrillerBaseIE): - _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P[\w\._]+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P[\w.]+)/?(?:$|[#?])' _TESTS = [{ - # first videos request only returns 2 videos 'url': 'https://triller.co/@theestallion', - 'playlist_mincount': 9, + 'playlist_mincount': 12, 'info_dict': { 'id': '18992236', 'title': 'theestallion', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', - } + }, }, { 'url': 'https://triller.co/@charlidamelio', - 'playlist_mincount': 25, + 'playlist_mincount': 150, 'info_dict': { 'id': '1875551', 'title': 'charlidamelio', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', - } + }, }] def _real_initialize(self): if not self._API_HEADERS.get('Authorization'): guest = self._download_json( - f'{self._API_BASE_URL}/user/create_guest', - None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ + f'{self._API_BASE_URL}/user/create_guest', None, + note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ 'platform': 'Web', 'app_version': '', }) @@ -251,44 +269,65 @@ def _real_initialize(self): self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}' - def _extract_video_list(self, username, user_id, limit=6): - query = { - 'limit': limit, - } + def _entries(self, username, user_id, limit=6): + query = {'limit': limit} for page in itertools.count(1): - for retry in self.RetryManager(): - try: - video_list = self._download_json( - f'{self._API_BASE_URL}/api/users/{user_id}/videos', - username, note=f'Downloading user video list page {page}', - errnote='Unable to download user video list', headers=self._API_HEADERS, - query=query) - except ExtractorError as e: - if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: - retry.error = e - continue - raise - if not video_list.get('videos'): - break - yield from video_list['videos'] - query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp')) + videos = self._download_json( + f'{self._API_BASE_URL}/api/users/{user_id}/videos', + username, note=f'Downloading user video list page {page}', + headers=self._API_HEADERS, query=query) + + for video in traverse_obj(videos, ('videos', ...)): + yield self._parse_video_info(video, username, user_id) + + query['before_time'] = traverse_obj(videos, ('videos', -1, 'timestamp')) if not query['before_time']: break - def _entries(self, videos, username, user_info): - for video in videos: - yield self._parse_video_info(video, username, user_info) - def _real_extract(self, url): username = self._match_id(url) + user_info = self._check_user_info(self._download_json( f'{self._API_BASE_URL}/api/users/by_username/{username}', - username, note='Downloading user info', - errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {})) + username, note='Downloading user info', headers=self._API_HEADERS)['user']) user_id = str_or_none(user_info.get('user_id')) - videos = self._extract_video_list(username, user_id) - thumbnail = user_info.get('avatar_url') + if not user_id: + raise ExtractorError('Unable to extract user ID') return self.playlist_result( - self._entries(videos, username, user_info), user_id, username, thumbnail=thumbnail) + self._entries(username, user_id), user_id, username, thumbnail=user_info.get('avatar_url')) + + +class TrillerShortIE(InfoExtractor): + _VALID_URL = r'https?://v\.triller\.co/(?P\w+)' + _TESTS = [{ + 'url': 'https://v.triller.co/WWZNWk', + 'md5': 
'5eb8dc2c971bd8cd794ec9e8d5e9d101', + 'info_dict': { + 'id': '66210052', + 'ext': 'mp4', + 'title': 'md5:2dfc89d154cd91a4a18cd9582ba03e16', + 'display_id': 'f4480e1f-fb4e-45b9-a44c-9e6c679ce7eb', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:2dfc89d154cd91a4a18cd9582ba03e16', + 'uploader': 'statefairent', + 'uploader_id': '487545193', + 'creator': 'Official Summer Fair of LA', + 'timestamp': 1629655457, + 'upload_date': '20210822', + 'duration': 19, + 'view_count': int, + 'like_count': int, + 'artist': 'Unknown', + 'track': 'Unknown', + 'uploader_url': 'https://triller.co/@statefairent', + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + real_url = self._request_webpage(HEADRequest(url), self._match_id(url)).geturl() + if self.suitable(real_url): # Prevent infinite loop in case redirect fails + raise UnsupportedError(real_url) + return self.url_result(real_url) From 9be0fe1fd967f62cbf3c60bd14e1021a70abc147 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 26 Mar 2023 17:27:39 -0500 Subject: [PATCH 017/501] [extractor/nbc] Fix `NBCStations` direct mp4 formats (#6637) Authored by: bashonly --- yt_dlp/extractor/nbc.py | 57 ++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index b9f65e9270..ddc89a7c29 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -12,9 +12,13 @@ RegexNotFoundError, UserNotLive, clean_html, + determine_ext, + float_or_none, int_or_none, + mimetype2ext, parse_age_limit, parse_duration, + remove_end, smuggle_url, traverse_obj, try_get, @@ -22,7 +26,6 @@ unified_timestamp, update_url_query, url_basename, - xpath_attr, ) @@ -660,6 +663,7 @@ class NBCStationsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', + 'duration': 112.513, 'timestamp': 1661135892, 'upload_date': '20220822', 'uploader': 'NBC 4', @@ -676,6 +680,7 @@ class NBCStationsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'duration': 172.406, 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', @@ -685,6 +690,22 @@ class NBCStationsIE(InfoExtractor): 'params': { 'skip_download': 'm3u8', }, + }, { + # direct mp4 link + 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/', + 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85', + 'info_dict': { + 'id': '2961135', + 'ext': 'mp4', + 'title': 'Highs Near Freezing in Boston on Wednesday', + 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b', + 'duration': 235.669, + 'timestamp': 1675268656, + 'upload_date': '20230201', + 'uploader': '', + 'channel_id': 'WBTS', + 'channel': 'nbcboston', + }, }] _RESOLUTIONS = { @@ -711,7 +732,7 @@ def _real_extract(self, url): if not video_data: raise ExtractorError('No video metadata found in webpage', expected=True) - info, formats, subtitles = {}, [], {} + info, formats = {}, [] is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1 query = { 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3', @@ -747,13 +768,14 @@ def _real_extract(self, url): video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False) if video_url: + ext = determine_ext(video_url) height 
= self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None)
             formats.append({
                 'url': video_url,
-                'ext': 'mp4',
+                'ext': ext,
                 'width': int_or_none(self._RESOLUTIONS.get(height)),
                 'height': int_or_none(height),
-                'format_id': 'http-mp4',
+                'format_id': f'http-{ext}',
             })
 
         info.update({
@@ -770,14 +792,25 @@ def _real_extract(self, url):
             smil = self._download_xml(
                 f'https://link.theplatform.com/s/{pdk_acct}/{player_id}',
                 video_id, note='Downloading SMIL data', query=query, fatal=is_live)
-        if smil:
-            manifest_url = xpath_attr(smil, f'.//{{{default_ns}}}video', 'src', fatal=is_live)
-            subtitles = self._parse_smil_subtitles(smil, default_ns)
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(
-                manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
-                live=is_live, errnote='No HLS formats found')
-            formats.extend(fmts)
-            self._merge_subtitles(subs, target=subtitles)
+        subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {}
+        for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []:
+            info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000)
+            video_src_url = video.get('src')
+            ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url))
+            if ext == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
+                    live=is_live, errnote='No HLS formats found')
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            elif video_src_url:
+                formats.append({
+                    'url': video_src_url,
+                    'format_id': f'https-{ext}',
+                    'ext': ext,
+                    'width': int_or_none(video.get('width')),
+                    'height': int_or_none(video.get('height')),
+                })
 
         if not formats:
             self.raise_no_formats('No video content found in webpage', expected=True)
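Editor's note (not part of the patch series): the NBCStations fix above boils down to
branching on each SMIL <video> node's declared type. A self-contained sketch of that
pattern, assuming the node attributes arrive as plain dicts; `expand_hls` is a
hypothetical stand-in for `_extract_m3u8_formats_and_subtitles`:

    from yt_dlp.utils import determine_ext, int_or_none, mimetype2ext

    def formats_from_smil_videos(videos, expand_hls):
        formats = []
        for video in videos:  # each item: the attributes of one <video> node
            src = video.get('src')
            # prefer the declared MIME type, fall back to the URL's extension
            ext = mimetype2ext(video.get('type'), default=determine_ext(src))
            if ext == 'm3u8':
                formats.extend(expand_hls(src))  # expand the HLS master playlist
            elif src:
                formats.append({
                    'url': src,
                    'ext': ext,
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                })
        return formats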
From 95a383be1b6fb00c92ee3fb091732c4f6009acb6 Mon Sep 17 00:00:00 2001
From: Lesmiscore
Date: Mon, 27 Mar 2023 22:39:55 +0900
Subject: [PATCH 018/501] [extractor/iwara] Report private videos (#6641)

Authored by: Lesmiscore
---
 yt_dlp/extractor/iwara.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index 62a179700a..23f92786fc 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -4,6 +4,7 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     OnDemandPagedList,
     int_or_none,
     mimetype2ext,
@@ -75,7 +76,13 @@ def _extract_formats(self, video_id, fileurl):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(f'http://api.iwara.tv/video/{video_id}', video_id)
+        video_data = self._download_json(f'http://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True)
+        errmsg = video_data.get('message')
+        # at this point we can actually get uploaded user info, but do we need it?
+        if errmsg == 'errors.privateVideo':
+            self.raise_login_required('Private video. Log in if you have permission to watch')
+        elif errmsg:
+            raise ExtractorError(f'Iwara says: {errmsg}')
 
         return {
             'id': video_id,

From 0f0875ed555514f32522a0f30554fb08825d5124 Mon Sep 17 00:00:00 2001
From: Lesmiscore
Date: Tue, 28 Mar 2023 01:17:42 +0900
Subject: [PATCH 019/501] [postprocessor/EmbedThumbnail,postprocessor/FFmpegMetadata] Fix error on attaching thumbnails and info json for mkv/mka (#6647)

Authored by: Lesmiscore

Current yt-dlp code never hits this bug, but it would once filename sanitization gets better
---
 yt_dlp/postprocessor/embedthumbnail.py | 2 +-
 yt_dlp/postprocessor/ffmpeg.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py
index b02d9d499d..88a767132a 100644
--- a/yt_dlp/postprocessor/embedthumbnail.py
+++ b/yt_dlp/postprocessor/embedthumbnail.py
@@ -107,7 +107,7 @@ def run(self, info):
                     options.extend(['-map', '-0:%d' % old_stream])
                     new_stream -= 1
                 options.extend([
-                    '-attach', thumbnail_filename,
+                    '-attach', self._ffmpeg_filename_argument(thumbnail_filename),
                     '-metadata:s:%d' % new_stream, 'mimetype=%s' % mimetype,
                     '-metadata:s:%d' % new_stream, 'filename=cover.%s' % thumbnail_ext])
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 0e8f4c70b1..63fc9ace65 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -809,7 +809,7 @@ def _get_infojson_opts(self, info, infofn):
                 new_stream -= 1
 
         yield (
-            '-attach', infofn,
+            '-attach', self._ffmpeg_filename_argument(infofn),
             f'-metadata:s:{new_stream}', 'mimetype=application/json',
             f'-metadata:s:{new_stream}', 'filename=info.json',
         )

From ab92d8651c48d247dfb7d3f0a824cc986e47c7ed Mon Sep 17 00:00:00 2001
From: Lesmiscore
Date: Wed, 29 Mar 2023 15:28:29 +0900
Subject: [PATCH 020/501] [extractor/iwara] Accept old URLs

Authored by: Lesmiscore

Closes #6669
---
 yt_dlp/extractor/iwara.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index 23f92786fc..ae2960af00 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -15,7 +15,7 @@
 
 class IwaraIE(InfoExtractor):
     IE_NAME = 'iwara'
-    _VALID_URL = r'https?://(?:www\.)?iwara\.tv/video/(?P<id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)'
     _TESTS = [{
         # this video cannot be played because of migration
         'only_matching': True,

From 68be95bd0ca3f76aa63c9812935bd826b3a42e53 Mon Sep 17 00:00:00 2001
From: Lesmiscore
Date: Fri, 31 Mar 2023 11:56:49 +0900
Subject: [PATCH 021/501] [extractor/YahooGyaOIE,extractor/YahooGyaOPlayerIE] Delete extractors due to website closure (#6218)

Authored by: Lesmiscore
---
 yt_dlp/extractor/_extractors.py |   2 -
 yt_dlp/extractor/yahoo.py       | 117 --------------------------------
 2 files changed, 119 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index a97c458fa6..77a3c2ce97 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2343,8 +2343,6 @@
 from .yahoo import (
     YahooIE,
     YahooSearchIE,
-    YahooGyaOPlayerIE,
-    YahooGyaOIE,
     YahooJapanNewsIE,
 )
 from .yandexdisk import YandexDiskIE
diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py
index a69715b7c1..24148a0bd3 100644
--- a/yt_dlp/extractor/yahoo.py
+++ b/yt_dlp/extractor/yahoo.py
@@ -2,7 +2,6 @@
 import itertools
 import urllib.parse
 
-from .brightcove import BrightcoveNewIE
 from .common import InfoExtractor, SearchInfoExtractor
 from .youtube 
import YoutubeIE from ..utils import ( @@ -11,7 +10,6 @@ int_or_none, mimetype2ext, parse_iso8601, - smuggle_url, traverse_obj, try_get, url_or_none, @@ -337,121 +335,6 @@ def _search_results(self, query): break -class YahooGyaOPlayerIE(InfoExtractor): - IE_NAME = 'yahoo:gyao:player' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', - 'info_dict': { - 'id': '5993125228001', - 'ext': 'mp4', - 'title': 'フューリー 【字幕版】', - 'description': 'md5:21e691c798a15330eda4db17a8fe45a5', - 'uploader_id': '4235717419001', - 'upload_date': '20190124', - 'timestamp': 1548294365, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url).replace('/', ':') - headers = self.geo_verification_headers() - headers['Accept'] = 'application/json' - resp = self._download_json( - 'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={ - 'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-', - 'query': '''{ - content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) { - video { - delivery { - id - } - title - } - } -}''' % video_id, - }, headers=headers) - content = resp['data']['content'] - if not content: - msg = resp['errors'][0]['message'] - if msg == 'not in japan': - self.raise_geo_restricted(countries=['JP']) - raise ExtractorError(msg) - video = content['video'] - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': video['title'], - 'url': smuggle_url( - 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'], - {'geo_countries': ['JP']}), - 'ie_key': BrightcoveNewIE.ie_key(), - } - - -class YahooGyaOIE(InfoExtractor): - IE_NAME = 'yahoo:gyao' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f', - 'info_dict': { - 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f', - }, - 'playlist_mincount': 80, - }, { - 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', - 'only_matching': True, - }, { - 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }] - - def _entries(self, program_id): - page = 1 - while True: - playlist = self._download_json( - 
f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id, - note=f'Downloading JSON metadata page {page}') - if not playlist: - break - for video in playlist['videos']: - video_id = video.get('id') - if not video_id: - continue - if video.get('streamingAvailability') == 'notYet': - continue - yield self.url_result( - 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), - YahooGyaOPlayerIE.ie_key(), video_id) - if playlist.get('ended'): - break - page += 1 - - def _real_extract(self, url): - program_id = self._match_id(url).replace('/', ':') - return self.playlist_result(self._entries(program_id), program_id) - - class YahooJapanNewsIE(InfoExtractor): IE_NAME = 'yahoo:japannews' IE_DESC = 'Yahoo! Japan News' From 141a8dff98874a426d7fbe772e0a8421bb42656f Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Thu, 6 Apr 2023 19:44:22 +1200 Subject: [PATCH 022/501] [extractor/youtube] Fix comment loop detection for pinned comments (#6714) Pinned comments may repeat a second time - this is expected. Fixes https://github.com/yt-dlp/yt-dlp/issues/6712 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ca56f112bb..6dc36f9b99 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3316,9 +3316,17 @@ def extract_thread(contents): comment = self._extract_comment(comment_renderer, parent) if not comment: continue + is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge')) + comment_id = comment['id'] + if is_pinned: + tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. # See: https://github.com/yt-dlp/yt-dlp/issues/6290 - if comment['id'] in tracker['seen_comment_ids']: + if comment_id in tracker['seen_comment_ids']: + if comment_id in tracker['pinned_comment_ids'] and not is_pinned: + # Pinned comments may appear a second time in newest first sort + # See: https://github.com/yt-dlp/yt-dlp/issues/6712 + continue self.report_warning('Detected YouTube comments looping. 
Stopping comment extraction as we probably cannot get any more.') yield else: @@ -3348,7 +3356,9 @@ def extract_thread(contents): current_page_thread=0, total_parent_comments=0, total_reply_comments=0, - seen_comment_ids=set()) + seen_comment_ids=set(), + pinned_comment_ids=set() + ) # TODO: Deprecated # YouTube comments have a max depth of 2 From 0a6918a4a1431960181d8c50e0bbbcb0afbaff9a Mon Sep 17 00:00:00 2001 From: bashonly Date: Sat, 8 Apr 2023 11:09:05 -0500 Subject: [PATCH 023/501] [extractor/kick] Make initial request non-fatal Authored by: bashonly --- yt_dlp/extractor/kick.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index a79ffb7a98..765ffa0c80 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -14,7 +14,7 @@ class KickBaseIE(InfoExtractor): def _real_initialize(self): - self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session') + self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False) xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') if not xsrf_token: self.write_debug('kick.com did not set XSRF-TOKEN cookie') From ef0848abd425dfda6db62baa8d72897eefb0007f Mon Sep 17 00:00:00 2001 From: Chris Caruso Date: Tue, 11 Apr 2023 04:45:22 -0700 Subject: [PATCH 024/501] [extractor/youku] Improve error message (#6690) Authored by: carusocr Closes #6551 --- yt_dlp/extractor/youku.py | 44 +++------------------------------------ 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index 404f196f46..7ecd9f1839 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, get_element_by_class, js_to_json, str_or_none, @@ -26,48 +27,8 @@ class YoukuIE(InfoExtractor): ''' _TESTS = [{ - # MD5 is unstable - 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', - 'info_dict': { - 'id': 'XMTc1ODE5Njcy', - 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'mp4', - 'duration': 74.73, - 'thumbnail': r're:^https?://.*', - 'uploader': '。躲猫猫、', - 'uploader_id': '36017967', - 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', - 'tags': list, - } - }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', 'only_matching': True, - }, { - 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', - 'info_dict': { - 'id': 'XODgxNjg1Mzk2', - 'ext': 'mp4', - 'title': '武媚娘传奇 85', - 'duration': 1999.61, - 'thumbnail': r're:^https?://.*', - 'uploader': '疯狂豆花', - 'uploader_id': '62583473', - 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', - 'tags': list, - }, - }, { - 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', - 'info_dict': { - 'id': 'XMTI1OTczNDM5Mg', - 'ext': 'mp4', - 'title': '花千骨 04', - 'duration': 2363, - 'thumbnail': r're:^https?://.*', - 'uploader': '放剧场-花千骨', - 'uploader_id': '772849359', - 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', - 'tags': list, - }, }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 'note': 'Video protected with password', @@ -81,6 +42,7 @@ class YoukuIE(InfoExtractor): 'uploader_id': '322014285', 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', 'tags': list, + 'skip': '404', }, 'params': { 'videopassword': '100600', @@ -192,7 +154,7 @@ def _real_extract(self, url): else: msg = 'Youku server reported error %i' % error.get('code') if error_note 
is not None: - msg += ': ' + error_note + msg += ': ' + clean_html(error_note) raise ExtractorError(msg) # get video title From 7e35526d5b970a034b9d76215ee3e4bd7631edcd Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" Date: Tue, 11 Apr 2023 13:54:49 +0200 Subject: [PATCH 025/501] [extractor/hrefli] Add extractor (#6762) Authored by: selfisekai --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/hrefli.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 yt_dlp/extractor/hrefli.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 77a3c2ce97..808b558d18 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -734,6 +734,7 @@ ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrefli import HrefLiRedirectIE from .hrfensehen import HRFernsehenIE from .hrti import ( HRTiIE, diff --git a/yt_dlp/extractor/hrefli.py b/yt_dlp/extractor/hrefli.py new file mode 100644 index 0000000000..77db2ea687 --- /dev/null +++ b/yt_dlp/extractor/hrefli.py @@ -0,0 +1,15 @@ +from .common import InfoExtractor + + +class HrefLiRedirectIE(InfoExtractor): + IE_NAME = 'href.li' + IE_DESC = False # Do not list + _VALID_URL = r'https?://href\.li/\?(?P.+)' + + _TESTS = [{ + 'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result(self._match_valid_url(url).group('url')) From faa0332ed69e070cf3bd31390589a596e962f392 Mon Sep 17 00:00:00 2001 From: sian1468 <58017832+sian1468@users.noreply.github.com> Date: Tue, 11 Apr 2023 18:56:39 +0700 Subject: [PATCH 026/501] [extractor/line] Remove extractors (#6734) Service has shut down - https://archive.ph/txVKy Authored by: sian1468 --- yt_dlp/extractor/_extractors.py | 4 - yt_dlp/extractor/line.py | 143 -------------------------------- 2 files changed, 147 deletions(-) delete mode 100644 yt_dlp/extractor/line.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 808b558d18..5f4ae7b8df 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -944,10 +944,6 @@ LimelightChannelIE, LimelightChannelListIE, ) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) from .linkedin import ( LinkedInIE, LinkedInLearningIE, diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py deleted file mode 100644 index 3fab9c8a5d..0000000000 --- a/yt_dlp/extractor/line.py +++ /dev/null @@ -1,143 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - format_field, - int_or_none, - str_or_none, -) - - -class LineLiveBaseIE(InfoExtractor): - _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' - - def _parse_broadcast_item(self, item): - broadcast_id = compat_str(item['id']) - title = item['title'] - is_live = item.get('isBroadcastingNow') - - thumbnails = [] - for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - }) - - channel = item.get('channel') or {} - channel_id = str_or_none(channel.get('id')) - - return { - 'id': broadcast_id, - 'title': title, - 'thumbnails': thumbnails, - 'timestamp': int_or_none(item.get('createdAt')), - 'channel': channel.get('name'), - 'channel_id': channel_id, - 
'channel_url': format_field(channel_id, None, 'https://live.line.me/channels/%s'), - 'duration': int_or_none(item.get('archiveDuration')), - 'view_count': int_or_none(item.get('viewerCount')), - 'comment_count': int_or_none(item.get('chatCount')), - 'is_live': is_live, - } - - -class LineLiveIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' - _TESTS = [{ - 'url': 'https://live.line.me/channels/5833718/broadcast/18373277', - 'md5': '2c15843b8cb3acd55009ddcb2db91f7c', - 'info_dict': { - 'id': '18373277', - 'title': '2021/12/05 (15分犬)定例譲渡会🐶', - 'ext': 'mp4', - 'timestamp': 1638674925, - 'upload_date': '20211205', - 'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef', - 'channel_url': 'https://live.line.me/channels/5833718', - 'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕', - 'channel_id': '5833718', - 'duration': 937, - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - }, { - # archiveStatus == 'DELETED' - 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id, broadcast_id = self._match_valid_url(url).groups() - broadcast = self._download_json( - self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), - broadcast_id) - item = broadcast['item'] - info = self._parse_broadcast_item(item) - protocol = 'm3u8' if info['is_live'] else 'm3u8_native' - formats = [] - for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): - if not v: - continue - if k == 'abr': - formats.extend(self._extract_m3u8_formats( - v, broadcast_id, 'mp4', protocol, - m3u8_id='hls', fatal=False)) - continue - f = { - 'ext': 'mp4', - 'format_id': 'hls-' + k, - 'protocol': protocol, - 'url': v, - } - if not k.isdigit(): - f['vcodec'] = 'none' - formats.append(f) - if not formats: - archive_status = item.get('archiveStatus') - if archive_status != 'ARCHIVED': - self.raise_no_formats('this video has been ' + archive_status.lower(), expected=True) - info['formats'] = formats - return info - - -class LineLiveChannelIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' - _TEST = { - 'url': 'https://live.line.me/channels/5893542', - 'info_dict': { - 'id': '5893542', - 'title': 'いくらちゃんだよぉ🦒', - 'description': 'md5:4d418087973ad081ceb1b3481f0b1816', - }, - 'playlist_mincount': 29 - } - - def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): - while True: - for row in (archived_broadcasts.get('rows') or []): - share_url = str_or_none(row.get('shareURL')) - if not share_url: - continue - info = self._parse_broadcast_item(row) - info.update({ - '_type': 'url', - 'url': share_url, - 'ie_key': LineLiveIE.ie_key(), - }) - yield info - if not archived_broadcasts.get('hasNextPage'): - return - archived_broadcasts = self._download_json( - self._API_BASE_URL + channel_id + '/archived_broadcasts', - channel_id, query={ - 'lastId': info['id'], - }) - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) - return self.playlist_result( - self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), - channel_id, channel.get('title'), channel.get('information')) From 79c77e85b70ae3b9942d5a88c14d021a9bd24222 Mon Sep 17 00:00:00 2001 From: Shreyas Minocha <11537232+shreyasminocha@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:05:22 +0000 Subject: [PATCH 027/501] 
[extractor/zoom] Fix extractor (#6741) Authored by: shreyasminocha Closes #6677 --- yt_dlp/extractor/zoom.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index ef8b71522c..eb0ab795bc 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -5,6 +5,7 @@ str_or_none, js_to_json, parse_filesize, + traverse_obj, urlencode_postdata, urljoin, ) @@ -53,6 +54,9 @@ def _real_extract(self, url): r'(?s)window\.__data__\s*=\s*({.+?});', webpage, 'data'), play_id, js_to_json) + data = self._download_json( + f'{base_url}nws/recording/1.0/play/info/{data["fileId"]}', play_id)['result'] + subtitles = {} for _type in ('transcript', 'cc', 'chapter'): if data.get('%sUrl' % _type): @@ -67,11 +71,11 @@ def _real_extract(self, url): formats.append({ 'format_note': 'Camera stream', 'url': str_or_none(data.get('viewMp4Url')), - 'width': int_or_none(data.get('viewResolvtionsWidth')), - 'height': int_or_none(data.get('viewResolvtionsHeight')), - 'format_id': str_or_none(data.get('recordingId')), + 'width': int_or_none(traverse_obj(data, ('viewResolvtions', 0))), + 'height': int_or_none(traverse_obj(data, ('viewResolvtions', 1))), + 'format_id': str_or_none(traverse_obj(data, ('recording', 'id'))), 'ext': 'mp4', - 'filesize_approx': parse_filesize(data.get('fileSize')), + 'filesize_approx': parse_filesize(str_or_none(traverse_obj(data, ('recording', 'fileSizeInMB')))), 'preference': 0 }) @@ -79,16 +83,16 @@ def _real_extract(self, url): formats.append({ 'format_note': 'Screen share stream', 'url': str_or_none(data.get('shareMp4Url')), - 'width': int_or_none(data.get('shareResolvtionsWidth')), - 'height': int_or_none(data.get('shareResolvtionsHeight')), - 'format_id': str_or_none(data.get('shareVideoId')), + 'width': int_or_none(traverse_obj(data, ('shareResolvtions', 0))), + 'height': int_or_none(traverse_obj(data, ('shareResolvtions', 1))), + 'format_id': str_or_none(traverse_obj(data, ('shareVideo', 'id'))), 'ext': 'mp4', 'preference': -1 }) return { 'id': play_id, - 'title': data.get('topic'), + 'title': str_or_none(traverse_obj(data, ('meet', 'topic'))), 'subtitles': subtitles, 'formats': formats, 'http_headers': { From c6786ff3baaf72a5baa4d56d34058e54cbcf8ceb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 11 Apr 2023 16:11:15 +0530 Subject: [PATCH 028/501] [extractor/youtube] Revert default formats to `https` --- yt_dlp/extractor/youtube.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6dc36f9b99..d6a55e9532 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3789,15 +3789,13 @@ def build_fragments(f): if single_stream and dct.get('ext'): dct['container'] = dct['ext'] + '_dash' - if dct['filesize']: + if all_formats and dct['filesize']: yield { **dct, 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], 'protocol': 'http_dash_segments', 'fragments': build_fragments(dct), } - if not all_formats: - continue dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} yield dct From 26010b5cec50193b98ad7845d1d77450f9f14c2b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 11 Apr 2023 17:01:22 +0530 Subject: [PATCH 029/501] [postprocessor/FixupDuplicateMoov] Fix bug in triggering --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a7dced8e88..0d987dbb87 100644 --- 
a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3312,7 +3312,7 @@ def ffmpeg_fixup(cndn, msg, cls): or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) - ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', + ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments', 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) From 52ecc33e221f7de7eb6fed6c22489f0c5fdd2c6d Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Wed, 12 Apr 2023 01:19:34 +0900 Subject: [PATCH 030/501] [extractor/niconico] Download comments from the new endpoint (#6773) Authored by: Lesmiscore --- yt_dlp/extractor/niconico.py | 54 ++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 9c3a5a4bc8..cacefeb429 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -477,23 +477,32 @@ def _get_subtitles(self, video_id, api_data, session_api_data): user_id_str = session_api_data.get('serviceUserId') thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) - raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) - if not raw_danmaku: + legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or [] + + new_comments = traverse_obj(api_data, ('comment', 'nvComment')) + new_danmaku = self._extract_new_comments( + new_comments.get('server'), video_id, + new_comments.get('params'), new_comments.get('threadKey')) + + if not legacy_danmaku and not new_danmaku: self.report_warning(f'Failed to get comments. 
{bug_reports_message()}')
             return
+
         return {
             'comments': [{
                 'ext': 'json',
-                'data': json.dumps(raw_danmaku),
+                'data': json.dumps(legacy_danmaku + new_danmaku),
             }],
         }
 
-    def _extract_all_comments(self, video_id, threads, user_id, user_key):
+    def _extract_legacy_comments(self, video_id, threads, user_id, user_key):
         auth_data = {
             'user_id': user_id,
             'userkey': user_key,
         } if user_id and user_key else {'user_id': ''}
 
+        api_url = traverse_obj(threads, (..., 'server'), get_all=False)
+
         # Request Start
         post_data = [{'ping': {'content': 'rs:0'}}]
         for i, thread in enumerate(threads):
@@ -532,17 +541,32 @@ def _extract_all_comments(self, video_id, threads, user_id, user_key):
         # Request Final
         post_data.append({'ping': {'content': 'rf:0'}})
 
-        for api_url in self._COMMENT_API_ENDPOINTS:
-            comments = self._download_json(
-                api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
-                headers={
-                    'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
-                    'Origin': 'https://www.nicovideo.jp',
-                    'Content-Type': 'text/plain;charset=UTF-8',
-                }, note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
-            if comments:
-                return comments
+        return self._download_json(
+            f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False,
+            headers={
+                'Referer': f'https://www.nicovideo.jp/watch/{video_id}',
+                'Origin': 'https://www.nicovideo.jp',
+                'Content-Type': 'text/plain;charset=UTF-8',
+            },
+            note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
+
+    def _extract_new_comments(self, endpoint, video_id, params, thread_key):
+        comments = self._download_json(
+            f'{endpoint}/v1/threads', video_id, data=json.dumps({
+                'additionals': {},
+                'params': params,
+                'threadKey': thread_key,
+            }).encode(), fatal=False,
+            headers={
+                'Referer': 'https://www.nicovideo.jp/',
+                'Origin': 'https://www.nicovideo.jp',
+                'Content-Type': 'text/plain;charset=UTF-8',
+                'x-client-os-type': 'others',
+                'x-frontend-id': '6',
+                'x-frontend-version': '0',
+            },
+            note='Downloading comments (new)', errnote='Failed to download comments (new)')
+
+        return traverse_obj(comments, ('data', 'threads', ..., 'comments', ...))
 
 
 class NiconicoPlaylistBaseIE(InfoExtractor):

From c3f624ef0a5d7a6ae1c5ffeb243087e9fc7d79dc Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Wed, 12 Apr 2023 05:04:47 +0530
Subject: [PATCH 031/501] Relaxed validation for numeric format filters

Continued from f96bff99cb2cf1d112b099e5149dd2c3a6a76af2
Closes #6782
---
 yt_dlp/YoutubeDL.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 0d987dbb87..7b6fef2041 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1932,7 +1932,7 @@ def _build_format_filter(self, filter_spec):
             '!=': operator.ne,
         }
         operator_rex = re.compile(r'''(?x)\s*
-            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
+            (?P<key>[\w.-]+)\s*
             (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
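Editor's note (not part of the patch series): with the relaxed `(?P<key>[\w.-]+)` pattern
above, numeric format filters are no longer limited to the hard-coded field names. A
hypothetical illustration of what the filter parser now accepts:

    import yt_dlp

    # 'duration' is not in the old whitelist (width|height|tbr|abr|vbr|asr|
    # filesize|filesize_approx|fps), so this spec used to be rejected as an
    # invalid filter specification; with the patch it parses like any other
    # numeric filter:
    yt_dlp.YoutubeDL({'format': 'best[duration<600]'})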
From 2d97d154fe4fb84fe2ed3a4e1ed5819e89b71e88 Mon Sep 17 00:00:00 2001
From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com>
Date: Thu, 13 Apr 2023 03:19:08 +0900
Subject: [PATCH 032/501] [extractor/gmanetwork] Add extractor (#5945)

Authored by: HobbyistDev

Partially fixes #5770
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/gmanetwork.py  | 83 +++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 yt_dlp/extractor/gmanetwork.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 5f4ae7b8df..c2043bbd22 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -681,6 +681,7 @@
     GloboIE,
     GloboArticleIE,
 )
+from .gmanetwork import GMANetworkVideoIE
 from .go import GoIE
 from .godtube import GodTubeIE
 from .gofile import GofileIE
diff --git a/yt_dlp/extractor/gmanetwork.py b/yt_dlp/extractor/gmanetwork.py
new file mode 100644
index 0000000000..62fff4eadc
--- /dev/null
+++ b/yt_dlp/extractor/gmanetwork.py
@@ -0,0 +1,83 @@
+from .common import InfoExtractor
+from .dailymotion import DailymotionIE
+from .youtube import YoutubeIE
+
+
+class GMANetworkVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P<id>\d+)/(?P<display_id>[\w-]+)/video'
+    _TESTS = [{
+        'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home',
+        'info_dict': {
+            'id': '28BqW0AXPe0',
+            'ext': 'mp4',
+            'upload_date': '20220919',
+            'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
+            'like_count': int,
+            'view_count': int,
+            'uploader': 'YoüLOL',
+            'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
+            'duration': 5313,
+            'comment_count': int,
+            'tags': 'count:22',
+            'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
+            'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)',
+            'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
+            'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg',
+            'release_timestamp': 1663594212,
+            'age_limit': 0,
+            'channel_follower_count': int,
+            'categories': ['Entertainment'],
+            'description': 'md5:811bdcea74f9c48051824e494756e926',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'channel': 'YoüLOL',
+            'availability': 'public',
+            'release_date': '20220919',
+        }
+    }, {
+        'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home',
+        'info_dict': {
+            'id': 'yiDOExw2aSA',
+            'ext': 'mp4',
+            'live_status': 'not_live',
+            'channel': 'GMANetwork',
+            'like_count': int,
+            'channel_follower_count': int,
+            'description': 'md5:6d00cd658394fa1a5071200d3ed4be05',
+            'duration': 1419,
+            'age_limit': 0,
+            'comment_count': int,
+            'upload_date': '20181003',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp',
+            'availability': 'public',
+            'playable_in_embed': True,
+            'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng',
+            'title': 'More Than Words: Full Episode 80 (Finale)',
+            'uploader_id': 'GMANETWORK',
+            'categories': ['Entertainment'],
+            'uploader': 'GMANetwork',
+            'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng',
+            'tags': 'count:29',
+            'view_count': int,
+            'uploader_url': 'http://www.youtube.com/user/GMANETWORK',
+        }
+    }]
+
+    def _real_extract(self, url):
+        content_id, display_id = self._match_valid_url(url).group('id', 'display_id')
+        webpage = self._download_webpage(url, display_id)
+        # webpage route
+        youtube_id = self._search_regex(
+            r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P<youtube_id>[\w-]+)', webpage, 'youtube_id', fatal=False)
+        if youtube_id:
+            return self.url_result(youtube_id, YoutubeIE, youtube_id)
+
+        # api call route
+        # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11
+        network_url = self._search_regex(
+            r'NETWORK_URL\s*=\s*[\'"](?P<url>[^\'"]+)', webpage, 'network_url')
+        json_data = 
self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id) + if json_data.get('video_file'): + return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file']) + else: + return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file']) From b093c38cc9f26b59a8504211d792f053142c847d Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Thu, 13 Apr 2023 03:21:57 +0900 Subject: [PATCH 033/501] [extractor/biliIntl] Add comment extraction (#6079) Authored by: HobbyistDev --- yt_dlp/extractor/bilibili.py | 111 ++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c344397792..91d436dd85 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -26,6 +26,7 @@ srt_subtitles_timecode, str_or_none, traverse_obj, + unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, @@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE): 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', 'upload_date': '20221212', 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', + }, + }, { + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', + 'info_dict': { + 'id': '340317', + 'ext': 'mp4', + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '' + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro' + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro' + }], + }, + 'params': { + 'getcomments': True + } + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'comment_count': int, + 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + }, + 'params': { + 'getcomments': True } }, { # episode id without intro and outro @@ -1055,11 +1103,69 @@ def _extract_video_metadata(self, url, video_id, season_id): # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { + self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { 'title': self._html_search_meta('og:title', webpage), 'description': self._html_search_meta('og:description', webpage) }) + def _get_comments_reply(self, root_id, next_id=0, display_id=None): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/detail', display_id, + note=f'Downloading reply comment of {root_id} - {next_id}', + query={ + 'platform': 'web', + 'ps': 20, # comment's reply per page (default: 3) + 'root': root_id, + 'next': next_id, + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': 
traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'parent': replies.get('parent'), + 'timestamp': unified_timestamp(replies.get('ctime_text')) + } + + if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + yield from self._get_comments_reply( + root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + + def _get_comments(self, video_id, ep_id): + for i in itertools.count(0): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/root', video_id, + note=f'Downloading comment page {i + 1}', + query={ + 'platform': 'web', + 'pn': i, # page number + 'ps': 20, # comment per page (default: 20) + 'oid': video_id, + 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content + 'sort_type': 1, # 1: best, 2: recent + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), + } + if replies.get('count'): + yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + + if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + break + def _real_extract(self, url): season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') video_id = ep_id or aid @@ -1087,7 +1193,8 @@ def _real_extract(self, url): **self._extract_video_metadata(url, video_id, season_id), 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), - 'chapters': chapters + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id) } From 979568f26ece80bca72b48f0dd57d676e431059a Mon Sep 17 00:00:00 2001 From: MyNey <20515340+MinePlayersPE@users.noreply.github.com> Date: Thu, 13 Apr 2023 01:28:33 +0700 Subject: [PATCH 034/501] [extractor/BrainPOP] Add extractors (#6106) Authored by: MinePlayersPE Based on https://github.com/ytdl-org/youtube-dl/pull/10025 --- yt_dlp/extractor/_extractors.py | 8 + yt_dlp/extractor/brainpop.py | 318 ++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 yt_dlp/extractor/brainpop.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c2043bbd22..09903423d8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -254,6 +254,14 @@ BRMediathekIE, ) from .bravotv import BravoTVIE +from .brainpop import ( + BrainPOPIE, + BrainPOPJrIE, + BrainPOPELLIE, + BrainPOPEspIE, + BrainPOPFrIE, + BrainPOPIlIE, +) from .breakcom import BreakIE from .breitbart import BreitBartIE from .brightcove import ( diff --git a/yt_dlp/extractor/brainpop.py b/yt_dlp/extractor/brainpop.py new file mode 100644 index 0000000000..1200437e63 --- /dev/null +++ b/yt_dlp/extractor/brainpop.py @@ -0,0 +1,318 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + classproperty, + int_or_none, + traverse_obj, + urljoin +) + + +class 
BrainPOPBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'brainpop'
+    _ORIGIN = ''  # So that _VALID_URL doesn't crash
+    _LOGIN_ERRORS = {
+        1502: 'The username and password you entered did not match.',  # LOGIN_FAILED
+        1503: 'Payment method is expired.',  # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE
+        1506: 'Your BrainPOP plan has expired.',  # LOGIN_FAILED_ACCOUNT_EXPIRED
+        1507: 'Terms not accepted.',  # LOGIN_FAILED_TERMS_NOT_ACCEPTED
+        1508: 'Account not activated.',  # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE
+        1512: 'The maximum number of devices permitted are logged in with your account right now.',  # LOGIN_FAILED_LOGIN_LIMIT_REACHED
+        1513: 'You are trying to access your account from outside of its allowed IP range.',  # LOGIN_FAILED_INVALID_IP
+        1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.',  # LOGIN_FAILED_MBP_DISABLED
+        1515: 'Account not activated.',  # LOGIN_FAILED_TEACHER_NOT_ACTIVE
+        1523: 'That username and password won\'t work on this BrainPOP site.',  # LOGIN_FAILED_NO_ACCESS
+        1524: 'You\'ll need to join a class before you can login.',  # LOGIN_FAILED_STUDENT_NO_PERIOD
+        1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.',  # LOGIN_FAILED_ACCOUNT_LOCKED
+    }
+
+    @classproperty
+    def _VALID_URL(cls):
+        root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
+        return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'
+
+    def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}):
+        formats = []
+        formats = self._extract_m3u8_formats(
+            f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
+            display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
+        formats.append({
+            'format_id': format_id,
+            'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
+        })
+        for f in formats:
+            f.update(extra_fields)
+        return formats
+
+    def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}):
+        formats = []
+        additional_key_formats = {
+            '%s': {},
+            'ad_%s': {
+                'format_note': 'Audio description',
+                'source_preference': -2
+            }
+        }
+        for additional_key_format, additional_key_fields in additional_key_formats.items():
+            for key_quality, key_index in enumerate(('high', 'low')):
+                full_key_index = additional_key_format % (key_format % key_index)
+                if data.get(full_key_index):
+                    formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
+                        'quality': -1 - key_quality,
+                        **additional_key_fields,
+                        **extra_fields
+                    }))
+        return formats
+
+    def _perform_login(self, username, password):
+        login_res = self._download_json(
+            'https://api.brainpop.com/api/login', None,
+            data=json.dumps({'username': username, 'password': password}).encode(),
+            headers={
+                'Content-Type': 'application/json',
+                'Referer': self._ORIGIN
+            }, note='Logging in', errnote='Unable to log in', expected_status=400)
+        status_code = int_or_none(login_res['status_code'])
+        if status_code != 1505:
+            self.report_warning(
+                f'Unable to login: {self._LOGIN_ERRORS.get(status_code) or login_res.get("message")}'
+                or f'Got status code {status_code}')
+
+
+class BrainPOPIE(BrainPOPBaseIE):
+    _ORIGIN = 'https://www.brainpop.com'
+    _VIDEO_URL = 'https://svideos.brainpop.com'
+    _HLS_URL = 'https://hls.brainpop.com'
+    _CDN_URL = 'https://cdn.brainpop.com'
+    _TESTS = [{
+        'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
+        'md5': '3ead374233ae74c7f1b0029a01c972f0',
+        'info_dict': {
+            'id': '1f3259fa457292b4',
+            'ext': 'mp4',
+            'title': 'Martin Luther King, Jr.',
+            'display_id': 'martinlutherkingjr',
+            'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
+        },
+    }, {
+        'url': 'https://www.brainpop.com/science/space/bigbang/',
+        'md5': '9a1ff0e77444dd9e437354eb669c87ec',
+        'info_dict': {
+            'id': 'acae52cd48c99acf',
+            'ext': 'mp4',
+            'title': 'Big Bang',
+            'display_id': 'bigbang',
+            'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _real_extract(self, url):
+        slug, display_id = self._match_valid_url(url).group('slug', 'id')
+        movie_data = self._download_json(
+            f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id,
+            'Downloading movie data JSON', 'Unable to download movie data')['data']
+        topic_data = traverse_obj(self._download_json(
+            f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id,
+            'Downloading topic data JSON', 'Unable to download topic data', fatal=False),
+            ('data', 'topic'), expected_type=dict) or movie_data['topic']
+
+        if not traverse_obj(movie_data, ('access', 'allow')):
+            reason = traverse_obj(movie_data, ('access', 'reason'))
+            if 'logged' in reason:
+                self.raise_login_required(reason, metadata_available=True)
+            else:
+                self.raise_no_formats(reason, video_id=display_id)
+        movie_feature = movie_data['feature']
+        movie_feature_data = movie_feature['data']
+
+        formats, subtitles = [], {}
+        formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', {
+            'language': movie_feature.get('language') or 'en',
+            'language_preference': 10
+        }))
+        for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items():
+            formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', {
+                'language': lang,
+                'language_preference': -10
+            }))
+
+        # TODO: Do localization fields also have subtitles?
+        for name, url in movie_feature_data.items():
+            lang = self._search_regex(
+                r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None)
+            if lang and url:
+                subtitles.setdefault(lang, []).append({
+                    'url': urljoin(self._CDN_URL, url)
+                })
+
+        return {
+            'id': topic_data['topic_id'],
+            'display_id': display_id,
+            'title': topic_data.get('name'),
+            'description': topic_data.get('synopsis'),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class BrainPOPLegacyBaseIE(BrainPOPBaseIE):
+    def _parse_js_topic_data(self, topic_data, display_id, token):
+        movie_data = topic_data['movies']
+        # TODO: Are there non-burned subtitles?
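+        # _extract_adaptive_formats (from BrainPOPBaseIE above) expands each
+        # available quality key ('high'/'low', plus 'ad_'-prefixed audio
+        # description variants) into HLS formats plus a direct progressive URL,
+        # with the page's ec_token appended as the query string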
+ formats = self._extract_adaptive_formats(movie_data, token, display_id) + + return { + 'id': topic_data['EntryID'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'alt_title': topic_data.get('title'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + } + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + webpage = self._download_webpage(url, display_id) + topic_data = self._search_json( + r'var\s+content\s*=\s*', webpage, 'content data', + display_id, end_pattern=';')['category']['unit']['topic'] + token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token') + return self._parse_js_topic_data(topic_data, display_id, token) + + +class BrainPOPJrIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://jr.brainpop.com' + _VIDEO_URL = 'https://svideos-jr.brainpop.com' + _HLS_URL = 'https://hls-jr.brainpop.com' + _CDN_URL = 'https://cdn-jr.brainpop.com' + _TESTS = [{ + 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/', + 'md5': '04e0561bb21770f305a0ce6cf0d869ab', + 'info_dict': { + 'id': '347', + 'ext': 'mp4', + 'title': 'Emotions', + 'display_id': 'emotions', + }, + }, { + 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/', + 'md5': 'b0ed063bbd1910df00220ee29340f5d6', + 'info_dict': { + 'id': '29', + 'ext': 'mp4', + 'title': 'Arctic Habitats', + 'display_id': 'arctichabitats', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPELLIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://ell.brainpop.com' + _VIDEO_URL = 'https://svideos-esl.brainpop.com' + _HLS_URL = 'https://hls-esl.brainpop.com' + _CDN_URL = 'https://cdn-esl.brainpop.com' + _TESTS = [{ + 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/', + 'md5': 'a2012700cfb774acb7ad2e8834eed0d0', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Lesson 1', + 'display_id': 'lesson1', + 'alt_title': 'Personal Pronouns', + }, + }, { + 'url': 'https://ell.brainpop.com/level3/unit6/lesson5/', + 'md5': 'be19c8292c87b24aacfb5fda2f3f8363', + 'info_dict': { + 'id': '101', + 'ext': 'mp4', + 'title': 'Lesson 5', + 'display_id': 'lesson5', + 'alt_title': 'Review: Unit 6', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPEspIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Español' + _ORIGIN = 'https://esp.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/mx' + _TESTS = [{ + 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/', + 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9', + 'info_dict': { + 'id': '3893', + 'ext': 'mp4', + 'title': 'Ecosistemas', + 'display_id': 'ecosistemas', + 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3', + }, + }, { + 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/', + 'md5': '98c1b9559e0e33777209c425cda7dac4', + 'info_dict': { + 'id': '7146', + 'ext': 'mp4', + 'title': 'Emily Dickinson', + 'display_id': 'emily_dickinson', + 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPFrIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Français' + _ORIGIN = 'https://fr.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/fr' + _TESTS = [{ + 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/', + 'md5': '97e7f48af8af93f8a2be11709f239371', + 'info_dict': { + 'id': '1651', + 'ext': 'mp4', + 
'title': 'Sources d\'énergie', + 'display_id': 'sourcesdenergie', + 'description': 'md5:7eece350f019a21ef9f64d4088b2d857', + }, + }, { + 'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/', + 'md5': '0cf2b4f89804d0dd4a360a51310d445a', + 'info_dict': { + 'id': '5803', + 'ext': 'mp4', + 'title': 'Plagiat', + 'display_id': 'plagiat', + 'description': 'md5:4496d87127ace28e8b1eda116e77cd2b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPIlIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Hebrew' + _ORIGIN = 'https://il.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/he' + _TESTS = [{ + 'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/', + 'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641', + 'info_dict': { + 'id': '3782', + 'ext': 'mp4', + 'title': 'md5:e993632fcda0545d9205602ec314ad67', + 'display_id': 'subjects_3782', + 'description': 'md5:4cc084a8012beb01f037724423a4d4ed', + }, + }] From d1483ec693c79f0b4ddf493870bcb840aca4da08 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Thu, 13 Apr 2023 16:09:20 +0900 Subject: [PATCH 035/501] [extractor/iwara] Fix typo Authored by: Lesmiscore Closes #6795 --- yt_dlp/extractor/iwara.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index ae2960af00..9dbb141fd6 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -76,7 +76,7 @@ def _extract_formats(self, video_id, fileurl): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(f'http://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True) + video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True) errmsg = video_data.get('message') # at this point we can actually get uploaded user info, but do we need it? 
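         # expected_status above accepts any HTTP status, so API errors are only
         # surfaced through the JSON 'message' field handled below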
         if errmsg == 'errors.privateVideo':

From 56793f74c36899742d7abd52afb0deca97d469e1 Mon Sep 17 00:00:00 2001
From: hasezoey
Date: Thu, 13 Apr 2023 19:17:56 +0200
Subject: [PATCH 036/501] [extractor/iwara] Fix format sorting (#6651)

Authored by: hasezoey
---
 yt_dlp/extractor/iwara.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index 9dbb141fd6..a5aad26ee8 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -8,6 +8,7 @@
     OnDemandPagedList,
     int_or_none,
     mimetype2ext,
+    qualities,
     traverse_obj,
     unified_timestamp,
 )
@@ -64,13 +65,15 @@ def _extract_formats(self, video_id, fileurl):
         # https://github.com/yt-dlp/yt-dlp/issues/6549#issuecomment-1473771047
         x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest()
 
+        preference = qualities(['preview', '360', '540', 'Source'])
+
         files = self._download_json(fileurl, video_id, headers={'X-Version': x_version})
         for fmt in files:
             yield traverse_obj(fmt, {
                 'format_id': 'name',
                 'url': ('src', ('view', 'download'), {self._proto_relative_url}),
                 'ext': ('type', {mimetype2ext}),
-                'quality': ('name', {lambda x: int_or_none(x) or 1e4}),
+                'quality': ('name', {preference}),
                 'height': ('name', {int_or_none}),
             }, get_all=False)
 
@@ -84,6 +87,11 @@ def _real_extract(self, url):
         elif errmsg:
             raise ExtractorError(f'Iwara says: {errmsg}')
 
+        if not video_data.get('fileUrl'):
+            if video_data.get('embedUrl'):
+                return self.url_result(video_data.get('embedUrl'))
+            raise ExtractorError('This video is unplayable', expected=True)
+
         return {
             'id': video_id,
             'age_limit': 18 if video_data.get('rating') == 'ecchi' else 0,  # ecchi is 'sexy' in Japanese

From 90c1f5120694105496a6ad9e3ecfc6c25de6cae1 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Thu, 13 Apr 2023 13:56:12 -0500
Subject: [PATCH 037/501] [extractor/zoom] Fix share URL extraction (#6789)

Authored by: bashonly
---
 yt_dlp/extractor/zoom.py | 90 +++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py
index eb0ab795bc..3d7ccca760 100644
--- a/yt_dlp/extractor/zoom.py
+++ b/yt_dlp/extractor/zoom.py
@@ -13,8 +13,8 @@
 
 class ZoomIE(InfoExtractor):
     IE_NAME = 'zoom'
-    _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)'
-    _TEST = {
+    _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)'
+    _TESTS = [{
         'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
         'md5': 'ab445e8c911fddc4f9adc842c2c5d434',
         'info_dict': {
@@ -23,39 +23,73 @@ class ZoomIE(InfoExtractor):
             'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
             'ext': 'mp4',
             'title': 'China\'s "two sessions" and the new five-year plan',
         },
         'skip': 'Recording requires email authentication to access',
-    }
+    }, {
+        # play URL
+        'url': 'https://ffgolf.zoom.us/rec/play/qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+        'md5': '2c4b1c4e5213ebf9db293e88d9385bee',
+        'info_dict': {
+            'id': 'qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+            'ext': 'mp4',
+            'title': 'Prépa AF2023 - Séance 5 du 11 avril - R20/VM/GO',
+        },
+    }, {
+        # share URL
+        'url': 'https://us02web.zoom.us/rec/share/hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+        'md5': '90fdc7cfcaee5d52d1c817fc03c43c9b',
+        'info_dict': {
+            'id': 'hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+            'ext': 'mp4',
+            'title': 'Timea Andrea Lelik\'s Personal Meeting Room',
+        },
+    }]
 
-    def _real_extract(self, url):
-        base_url, play_id = self._match_valid_url(url).groups()
-        webpage = self._download_webpage(url, play_id)
+    def _get_page_data(self, webpage, video_id):
+        return self._search_json(
+            r'window\.__data__\s*=', webpage, 'data', video_id, transform_source=js_to_json)
 
+    def _get_real_webpage(self, url, base_url, video_id, url_type):
+        webpage = self._download_webpage(url, video_id, note=f'Downloading {url_type} webpage')
         try:
             form = self._form_hidden_inputs('password_form', webpage)
         except ExtractorError:
-            form = None
-        if form:
-            password = self.get_param('videopassword')
-            if not password:
-                raise ExtractorError(
-                    'This video is protected by a passcode, use the --video-password option', expected=True)
-            is_meeting = form.get('useWhichPasswd') == 'meeting'
-            validation = self._download_json(
-                base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
-                play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
-                    'id': form[('meet' if is_meeting else 'file') + 'Id'],
-                    'passwd': password,
-                    'action': form.get('action'),
-                }))
-            if not validation.get('status'):
-                raise ExtractorError(validation['errorMessage'], expected=True)
-            webpage = self._download_webpage(url, play_id)
+            return webpage
 
-        data = self._parse_json(self._search_regex(
-            r'(?s)window\.__data__\s*=\s*({.+?});',
-            webpage, 'data'), play_id, js_to_json)
+        password = self.get_param('videopassword')
+        if not password:
+            raise ExtractorError(
+                'This video is protected by a passcode, use the --video-password option', expected=True)
+        is_meeting = form.get('useWhichPasswd') == 'meeting'
+        validation = self._download_json(
+            base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
+            video_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
+                'id': form[('meet' if is_meeting else 'file') + 'Id'],
+                'passwd': password,
+                'action': form.get('action'),
+            }))
+        if not validation.get('status'):
+            raise ExtractorError(validation['errorMessage'], expected=True)
+        return self._download_webpage(url, video_id, note=f'Re-downloading {url_type} webpage')
+
+    def _real_extract(self, url):
+        base_url, url_type, video_id = self._match_valid_url(url).group('base_url', 'type', 'id')
+
+        if url_type == 'share':
+            webpage = self._get_real_webpage(url, base_url, video_id, 'share')
+            meeting_id = self._get_page_data(webpage, video_id)['meetingId']
+            redirect_path = self._download_json(
+                f'{base_url}nws/recording/1.0/play/share-info/{meeting_id}',
+                video_id, note='Downloading share info JSON')['result']['redirectUrl']
+            url = urljoin(base_url, redirect_path)
+
+        webpage = self._get_real_webpage(url, base_url, video_id, 'play')
+        file_id = self._get_page_data(webpage, video_id)['fileId']
+        if not file_id:
+            # When things go wrong, file_id can be empty string
+            raise ExtractorError('Unable to extract file ID')
 
         data = self._download_json(
-            f'{base_url}nws/recording/1.0/play/info/{data["fileId"]}', play_id)['result']
+            f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id,
+            note='Downloading play info JSON')['result']
 
         subtitles = {}
         for _type in ('transcript', 'cc', 'chapter'):
@@ -91,7 +125,7 @@ def _real_extract(self, url):
             })
 
         return {
-            'id': play_id,
+            'id': video_id,
             'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
'subtitles': subtitles, 'formats': formats, From 925936908a3c3ee0e508621db14696b9f6a8b563 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:05:57 -0500 Subject: [PATCH 038/501] [extractor/tiktok] Fix and improve metadata extraction (#6777) Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 187 +++++++++++++++++++++++-------------- 1 file changed, 116 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index fb838d5298..63708229ee 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -287,17 +287,15 @@ def extract_addr(addr, add_meta={}): thumbnails = [] for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', 'origin_cover', 'dynamic_cover'): - cover = video_info.get(cover_id) - if cover: - for cover_url in cover['url_list']: - thumbnails.append({ - 'id': cover_id, - 'url': cover_url, - }) + for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)): + thumbnails.append({ + 'id': cover_id, + 'url': cover_url, + }) - stats_info = aweme_detail.get('statistics', {}) - author_info = aweme_detail.get('author', {}) - music_info = aweme_detail.get('music', {}) + stats_info = aweme_detail.get('statistics') or {} + author_info = aweme_detail.get('author') or {} + music_info = aweme_detail.get('music') or {} user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, 'sec_uid', 'id', 'uid', 'unique_id', expected_type=str_or_none, get_all=False)) @@ -319,20 +317,27 @@ def extract_addr(addr, add_meta={}): 'extractor_key': TikTokIE.ie_key(), 'extractor': TikTokIE.IE_NAME, 'webpage_url': self._create_url(author_info.get('uid'), aweme_id), - 'title': aweme_detail.get('desc'), - 'description': aweme_detail.get('desc'), - 'view_count': int_or_none(stats_info.get('play_count')), - 'like_count': int_or_none(stats_info.get('digg_count')), - 'repost_count': int_or_none(stats_info.get('share_count')), - 'comment_count': int_or_none(stats_info.get('comment_count')), - 'uploader': str_or_none(author_info.get('unique_id')), - 'creator': str_or_none(author_info.get('nickname')), - 'uploader_id': str_or_none(author_info.get('uid')), + **traverse_obj(aweme_detail, { + 'title': ('desc', {str}), + 'description': ('desc', {str}), + 'timestamp': ('create_time', {int_or_none}), + }), + **traverse_obj(stats_info, { + 'view_count': 'play_count', + 'like_count': 'digg_count', + 'repost_count': 'share_count', + 'comment_count': 'comment_count', + }, expected_type=int_or_none), + **traverse_obj(author_info, { + 'uploader': 'unique_id', + 'uploader_id': 'uid', + 'creator': 'nickname', + 'channel_id': 'sec_uid', + }, expected_type=str_or_none), 'uploader_url': user_url, 'track': music_track, 'album': str_or_none(music_info.get('album')) or None, 'artist': music_author or None, - 'timestamp': int_or_none(aweme_detail.get('create_time')), 'formats': formats, 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), 'thumbnails': thumbnails, @@ -344,37 +349,27 @@ def extract_addr(addr, add_meta={}): '_format_sort_fields': ('quality', 'codec', 'size', 'br'), } - def _parse_aweme_video_web(self, aweme_detail, webpage_url): + def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): video_info = aweme_detail['video'] author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={}) music_info = aweme_detail.get('music') or {} stats_info = aweme_detail.get('stats') or {} - user_url = self._UPLOADER_URL_FORMAT 
% (traverse_obj(author_info, - 'secUid', 'id', 'uid', 'uniqueId', - expected_type=str_or_none, get_all=False) - or aweme_detail.get('authorSecId')) + channel_id = traverse_obj(author_info or aweme_detail, (('authorSecId', 'secUid'), {str}), get_all=False) + user_url = self._UPLOADER_URL_FORMAT % channel_id if channel_id else None formats = [] - play_url = video_info.get('playAddr') - width = video_info.get('width') - height = video_info.get('height') - if isinstance(play_url, str): - formats = [{ + width = int_or_none(video_info.get('width')) + height = int_or_none(video_info.get('height')) + + for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})): + formats.append({ 'url': self._proto_relative_url(play_url), 'ext': 'mp4', 'width': width, 'height': height, - }] - elif isinstance(play_url, list): - formats = [{ - 'url': self._proto_relative_url(url), - 'ext': 'mp4', - 'width': width, - 'height': height, - } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none) if url] + }) - download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none) - if download_url: + for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})): formats.append({ 'format_id': 'download', 'url': self._proto_relative_url(download_url), @@ -382,38 +377,48 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url): 'width': width, 'height': height, }) + self._remove_duplicate_formats(formats) thumbnails = [] - for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'): - if aweme_detail.get(thumbnail_name): - thumbnails = [{ - 'url': self._proto_relative_url(aweme_detail[thumbnail_name]), - 'width': width, - 'height': height - }] + for thumb_url in traverse_obj(aweme_detail, ( + (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none})): + thumbnails.append({ + 'url': self._proto_relative_url(thumb_url), + 'width': width, + 'height': height, + }) return { - 'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none), - 'title': aweme_detail.get('desc'), - 'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int), - 'view_count': int_or_none(stats_info.get('playCount')), - 'like_count': int_or_none(stats_info.get('diggCount')), - 'repost_count': int_or_none(stats_info.get('shareCount')), - 'comment_count': int_or_none(stats_info.get('commentCount')), - 'timestamp': int_or_none(aweme_detail.get('createTime')), - 'creator': str_or_none(author_info.get('nickname')), - 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')), - 'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')), + 'id': video_id, + **traverse_obj(aweme_detail, { + 'title': ('desc', {str}), + 'description': ('desc', {str}), + 'duration': ('video', 'duration', {int_or_none}), + 'timestamp': ('createTime', {int_or_none}), + }), + **traverse_obj(author_info or aweme_detail, { + 'creator': ('nickname', {str}), + 'uploader': (('uniqueId', 'author'), {str}), + 'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}), + }, get_all=False), + **traverse_obj(stats_info, { + 'view_count': 'playCount', + 'like_count': 'diggCount', + 'repost_count': 'shareCount', + 'comment_count': 'commentCount', + }, expected_type=int_or_none), + **traverse_obj(music_info, { + 'track': 'title', + 'album': ('album', {lambda x: x or None}), + 'artist': 'authorName', + }, 
expected_type=str), + 'channel_id': channel_id, 'uploader_url': user_url, - 'track': str_or_none(music_info.get('title')), - 'album': str_or_none(music_info.get('album')) or None, - 'artist': str_or_none(music_info.get('authorName')), 'formats': formats, 'thumbnails': thumbnails, - 'description': str_or_none(aweme_detail.get('desc')), 'http_headers': { - 'Referer': webpage_url + 'Referer': webpage_url, } } @@ -447,7 +452,8 @@ class TikTokIE(TikTokBaseIE): 'artist': 'Ysrbeats', 'album': 'Lehanga', 'track': 'Lehanga', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', 'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b', @@ -462,6 +468,7 @@ class TikTokIE(TikTokBaseIE): 'uploader': 'patrox', 'uploader_id': '18702747', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', + 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', 'creator': 'patroX', 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'upload_date': '20190930', @@ -472,7 +479,7 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', 'track': 'Big Fun', - } + }, }, { # Banned audio, only available on the app 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402', @@ -485,6 +492,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', 'uploader_id': '6974687867511718913', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', + 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', 'track': 'Boka Dance', 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', 'timestamp': 1626121503, @@ -495,7 +503,7 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { # Sponsored video, only available with feed workaround 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561', @@ -508,6 +516,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'Slap And Run', 'uploader_id': '7036055384943690754', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', + 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', 'track': 'Promoted Music', 'timestamp': 1639754738, 'duration': 30, @@ -518,7 +527,6 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['trying with webpage', 'Unable to find video in feed'] }, { # Video without title and description 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', @@ -531,6 +539,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'Pokemon', 'uploader_id': '6820838815978423302', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', + 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', 'track': 'original sound', 'timestamp': 1643714123, 'duration': 6, @@ -577,6 +586,7 @@ class TikTokIE(TikTokBaseIE): 'uploader': '_le_cannibale_', 'uploader_id': '6604511138619654149', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', + 'channel_id': 
'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', 'artist': 'nathan !', 'track': 'grahamscott canon', 'upload_date': '20220905', @@ -587,6 +597,33 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, 'thumbnail': r're:^https://.+\.webp', }, + }, { + # only available via web + 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662', + 'md5': '8d8c0be14127020cd9f5def4a2e6b411', + 'info_dict': { + 'id': '7206382937372134662', + 'ext': 'mp4', + 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', + 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', + 'creator': 'MoxyPatch', + 'uploader': 'moxypatch', + 'uploader_id': '7039142049363379205', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', + 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', + 'artist': 'your worst nightmare', + 'track': 'original sound', + 'upload_date': '20230303', + 'timestamp': 1677866781, + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+', + 'thumbnails': 'count:3', + }, + 'expected_warnings': ['Unable to find video in feed'], }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', @@ -612,7 +649,7 @@ def _real_extract(self, url): video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict) if status == 0: - return self._parse_aweme_video_web(video_data, url) + return self._parse_aweme_video_web(video_data, url, video_id) elif status == 10216: raise ExtractorError('This video is private', expected=True) raise ExtractorError('Video not available', video_id=video_id) @@ -839,6 +876,7 @@ class DouyinIE(TikTokBaseIE): 'description': '#杨超越 小小水手带你去远航❤️', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', 'duration': 19782, 'timestamp': 1620905839, @@ -848,6 +886,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6982497745948921092', @@ -859,8 +898,9 @@ class DouyinIE(TikTokBaseIE): 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'uploader_id': '408654318141572', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', + 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', 'creator': '杨超越工作室', - 'duration': 42608, + 'duration': 42479, 'timestamp': 1625739481, 'upload_date': '20210708', 'track': '@杨超越工作室创作的原声', @@ -868,6 +908,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6953975910773099811', @@ -879,8 +920,9 @@ class DouyinIE(TikTokBaseIE): 'description': '#一起看海 出现在你的夏日里', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', - 'duration': 17228, + 'duration': 17343, 'timestamp': 1619098692, 'upload_date': '20210422', 'track': '@杨超越创作的原声', @@ -888,6 +930,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 
'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6950251282489675042', @@ -916,6 +959,7 @@ class DouyinIE(TikTokBaseIE): 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', 'duration': 15115, 'timestamp': 1621261163, @@ -925,6 +969,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }] _APP_VERSIONS = [('23.3.0', '230300')] @@ -956,7 +1001,7 @@ def _real_extract(self, url): render_data = self._parse_json( render_data_json, video_id, transform_source=compat_urllib_parse_unquote) - return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url) + return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id) class TikTokVMIE(InfoExtractor): From 3f7e2bd80e3c5d8a1682f20a1b245fcd974f295d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:21:09 -0500 Subject: [PATCH 039/501] [FFmpegFixupM3u8PP] Check audio codec before fixup (#6778) Closes #6673 Authored by: bashonly --- yt_dlp/postprocessor/ffmpeg.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 63fc9ace65..323f4303c0 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -898,8 +898,11 @@ def _needs_fixup(self, info): @PostProcessor._restrict_to(images=False) def run(self, info): if all(self._needs_fixup(info)): + args = ['-f', 'mp4'] + if self.get_audio_codec(info['filepath']) == 'aac': + args.extend(['-bsf:a', 'aac_adtstoasc']) self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [ - *self.stream_copy_opts(), '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']) + *self.stream_copy_opts(), *args]) return [], info From 93e7c6995e07dafb9dcc06c0d06acf6c5bdfecc5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:36:06 -0500 Subject: [PATCH 040/501] [extractor/generic] Attempt to detect live HLS (#6775) * Extract duration for non-live generic HLS videos * Add extractor-arg `is_live` to bypass live HLS check Closes #6705 Authored by: bashonly --- README.md | 1 + yt_dlp/extractor/generic.py | 63 +++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 3e8484314f..35229f728e 100644 --- a/README.md +++ b/README.md @@ -1800,6 +1800,7 @@ #### generic * `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg * `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE` * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. 
Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist +* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 75355aeb5b..87cf11d6bd 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -14,6 +14,7 @@ ExtractorError, UnsupportedError, determine_ext, + determine_protocol, dict_get, extract_basic_auth, format_field, @@ -867,7 +868,7 @@ class GenericIE(InfoExtractor): }, }, { - # Video.js embed, multiple formats + # Youtube embed, formerly: Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', 'info_dict': { 'id': 'yygqldloqIk', @@ -894,6 +895,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # rtl.nl embed { @@ -2169,6 +2171,33 @@ class GenericIE(InfoExtractor): 'age_limit': 18, }, }, + { + 'note': 'Live HLS direct link', + 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8', + 'info_dict': { + 'id': 'index', + 'title': r're:index', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + 'note': 'Video.js VOD HLS', + 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html', + 'info_dict': { + 'id': 'videojs_hls_test', + 'title': 'video', + 'ext': 'mp4', + 'age_limit': 0, + 'duration': 1800, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def report_following_redirect(self, new_url): @@ -2205,6 +2234,22 @@ def _extra_manifest_info(self, info, manifest_url): for fmt in self._downloader._get_formats(info): fmt['url'] = update_url_query(fmt['url'], query) + # Attempt to detect live HLS or set VOD duration + m3u8_format = next((f for f in self._downloader._get_formats(info) + if determine_protocol(f) == 'm3u8_native'), None) + if m3u8_format: + is_live = self._configuration_arg('is_live', [None])[0] + if is_live is not None: + info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' + return + headers = m3u8_format.get('http_headers') or info.get('http_headers') + duration = self._extract_m3u8_vod_duration( + m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', + errnote='Failed to download m3u8 media playlist', headers=headers) + if not duration: + info['live_status'] = 'is_live' + info['duration'] = info.get('duration') or duration + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', @@ -2580,8 +2625,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): varname = mobj.group(1) sources = variadic(self._parse_json( mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) - formats = [] - subtitles = {} + formats, subtitles, src = [], {}, None for source in sources: src = source.get('src') if not src or not isinstance(src, str): @@ -2604,8 +2648,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - for fmt in formats: - self._extra_manifest_info(fmt, src) if not 
formats: formats.append({ @@ -2621,11 +2663,11 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): sub = self._parse_json( sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} - src = str_or_none(sub.get('src')) - if not src: + sub_src = str_or_none(sub.get('src')) + if not sub_src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': urllib.parse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, sub_src), 'name': sub.get('label'), 'http_headers': { 'Referer': actual_url, @@ -2633,7 +2675,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): }) if formats or subtitles: self.report_detected('video.js embed') - return [{'formats': formats, 'subtitles': subtitles}] + info_dict = {'formats': formats, 'subtitles': subtitles} + if formats: + self._extra_manifest_info(info_dict, src) + return [info_dict] # Look for generic KVS player (before json-ld bc of some urls that break otherwise) found = self._search_regex(( From 7666b93604b97e9ada981c6b04ccf5605dd1bd44 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Fri, 14 Apr 2023 07:58:36 +0000 Subject: [PATCH 041/501] [extractor/youtube] Define strict uploader metadata mapping (#6384) New mapping: ``` channel -> channel name channel_id -> UCID channel_url -> UCID channel url uploader -> channel name (same as channel field) uploader_id -> @handle uploader_url -> @handle channel url ``` Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 709 +++++++++++++++++++----------------- 1 file changed, 371 insertions(+), 338 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d6a55e9532..2b17751e5e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -458,6 +458,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en + _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + + def ucid_or_none(self, ucid): + return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) + + def handle_or_none(self, handle): + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + + def handle_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', + url, 'channel handle', default=None) + + def ucid_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', + url, 'channel id', default=None) + @functools.cached_property def _preferred_lang(self): """ @@ -992,6 +1009,8 @@ def _extract_video(self, renderer): if not channel_id: channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + channel_id = self.ucid_or_none(channel_id) + overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) @@ -1233,9 +1252,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 
'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1254,7 +1270,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'start_time': 1, 'end_time': 9, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', } }, { @@ -1266,9 +1285,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120608', 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, }, 'skip': 'Private video', @@ -1280,9 +1296,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1299,7 +1312,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'age_limit': 0, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', }, 'params': { 'skip_download': True, @@ -1312,10 +1328,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', - 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -1333,8 +1346,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. 
Spree Wilson', 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', 'abr': 129.495, 'like_count': int, @@ -1346,13 +1357,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp', 'channel': 'Afrojack', - 'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO', 'tags': 'count:19', 'availability': 'public', 'categories': ['Music'], 'age_limit': 0, 'alt_title': 'The Spark', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Afrojack', + 'uploader_url': 'https://www.youtube.com/@Afrojack', + 'uploader_id': '@Afrojack', }, 'params': { 'youtube_include_dash_manifest': True, @@ -1369,9 +1382,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'age_limit': 18, 'categories': ['Gaming'], @@ -1385,7 +1395,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'UCzybXLxv08IApdjdN0mJhEg', 'playable_in_embed': True, 'view_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'The Witcher', + 'uploader_url': 'https://www.youtube.com/@thewitcher', + 'uploader_id': '@thewitcher', }, }, { @@ -1397,12 +1410,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Godzilla 2 (Official Video)', 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'upload_date': '20200408', - 'uploader_id': 'FlyingKitty900', - 'uploader': 'FlyingKitty', 'age_limit': 18, 'availability': 'needs_auth', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', - 'uploader_url': 'http://www.youtube.com/user/FlyingKitty900', 'channel': 'FlyingKitty', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', 'view_count': int, @@ -1413,7 +1423,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'duration': 177, 'playable_in_embed': True, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'FlyingKitty', + 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', + 'uploader_id': '@FlyingKitty900', }, }, { @@ -1424,13 +1437,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', 'ext': 'mp4', 'upload_date': '20191228', - 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', - 'uploader': 'Projekt Melody', 'description': 'md5:17eccca93a786d51bc67646756894066', 'age_limit': 18, 'like_count': int, 'availability': 'needs_auth', - 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'view_count': int, 'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp', @@ -1442,7 +1452,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 106, 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Projekt Melody', + 'uploader_url': 'https://www.youtube.com/@ProjektMelody', + 'uploader_id': '@ProjektMelody', }, }, { @@ -1452,8 +1465,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'MeJVWBSsPAY', 'ext': 'mp4', 'title': 'OOMPH! 
- Such Mich Find Mich (Lyrics)', - 'uploader': 'Herr Lurik', - 'uploader_id': 'st3in234', 'description': 'Fan Video. Music & Lyrics by OOMPH!.', 'upload_date': '20130730', 'track': 'Such mich find mich', @@ -1470,11 +1481,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA', 'categories': ['Music'], 'availability': 'public', - 'uploader_url': 'http://www.youtube.com/user/st3in234', 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA', 'live_status': 'not_live', 'artist': 'OOMPH!', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Herr Lurik', + 'uploader_url': 'https://www.youtube.com/@HerrLurik', + 'uploader_id': '@HerrLurik', }, }, { @@ -1491,11 +1504,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 266, 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', - 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', 'availability': 'public', @@ -1513,7 +1523,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ', 'categories': ['Music'], 'album': 'Some Chords', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'deadmau5', + 'uploader_url': 'https://www.youtube.com/@deadmau5', + 'uploader_id': '@deadmau5', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1527,10 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'duration': 6085, 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'description': 'md5:04bbbf3ccceb6795947572ca36f45904', - 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', 'like_count': int, 'release_timestamp': 1343767800, @@ -1546,7 +1556,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'was_live', 'view_count': int, 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Olympics', + 'uploader_url': 'https://www.youtube.com/@Olympics', + 'uploader_id': '@Olympics', }, 'params': { 'skip_download': 'requires avconv', @@ -1561,10 +1574,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'duration': 85, 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', 'playable_in_embed': True, 'channel': '孫ᄋᄅ', @@ -1579,7 +1589,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'availability': 'unlisted', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': '孫ᄋᄅ', + 'uploader_url': 'https://www.youtube.com/@AllenMeow', + 'uploader_id': '@AllenMeow', }, }, # url_encoded_fmt_stream_map is empty string @@ -1591,8 +1604,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', }, 'params': { 'skip_download': 'requires avconv', @@ -1609,9 
+1620,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'duration': 220, 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', 'formats': 'mincount:31', }, 'skip': 'not actual anymore', @@ -1624,9 +1632,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'CsmdDsKjzN8', 'ext': 'mp4', 'upload_date': '20150501', # According to ' 1 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', - 'thumbnails': list + 'thumbnails': list, + 'uploader_id': '@kurzgesagt', + 'uploader_url': 'https://www.youtube.com/@kurzgesagt', + 'uploader': 'Kurzgesagt – In a Nutshell', } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6989,11 +7021,12 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'title': 'Mobile Games on Console - Scott The Woz', 'upload_date': '20210920', 'uploader': 'Scott The Woz', - 'uploader_id': 'scottthewoz', - 'uploader_url': 'http://www.youtube.com/user/scottthewoz', + 'uploader_id': '@ScottTheWoz', + 'uploader_url': 'https://www.youtube.com/@ScottTheWoz', 'view_count': int, 'live_status': 'not_live', - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': 'count:20', } }] @@ -7031,13 +7064,13 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'id': 'qVv6vCqciTM', 'ext': 'mp4', 'age_limit': 0, - 'uploader_id': 'UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_id': '@sana_natori', 'comment_count': int, 'chapters': 'count:13', 'upload_date': '20221223', 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg', 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', - 'uploader_url': 'http://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_url': 'https://www.youtube.com/@sana_natori', 'like_count': int, 'release_date': '20221223', 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'], From 84ffeb7d5e72e3829319ba7720a8480fc4c7503b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 16 Apr 2023 03:16:23 +0530 Subject: [PATCH 042/501] [extractor] Do not warn for invalid chapter data in description Fixes https://github.com/yt-dlp/yt-dlp/issues/6811#issuecomment-1509876209 --- yt_dlp/extractor/common.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 838899052c..78288f8091 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3658,18 +3658,22 @@ def _extract_chapters_helper(self, chapter_list, start_function, title_function, 'start_time': start_function(chapter), 'title': title_function(chapter), } for chapter in chapter_list or []] - if not strict: + if strict: + warn = self.report_warning + else: + warn = self.write_debug chapter_list.sort(key=lambda c: c['start_time'] or 0) chapters = [{'start_time': 0}] for idx, chapter in enumerate(chapter_list): if chapter['start_time'] is None: - self.report_warning(f'Incomplete chapter {idx}') + warn(f'Incomplete chapter {idx}') elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: chapters.append(chapter) elif chapter not in chapters: - self.report_warning( - f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') + issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration + else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}') + warn(f'Invalid start time 
({issue}) for chapter "{chapter["title"]}"') return chapters[1:] def _extract_chapters_from_description(self, description, duration): From 9874e82b5a61582169300bea561b3e8899ad1ef7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 16 Apr 2023 08:54:48 +0530 Subject: [PATCH 043/501] Do not translate newlines in `--print-to-file` Fixes https://github.com/yt-dlp/yt-dlp/issues/6808#issuecomment-1509361107 --- yt_dlp/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7b6fef2041..31f7645dca 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2890,8 +2890,8 @@ def format_tmpl(tmpl): tmpl = format_tmpl(tmpl) self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') if self._ensure_dir_exists(filename): - with open(filename, 'a', encoding='utf-8') as f: - f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n') + with open(filename, 'a', encoding='utf-8', newline='') as f: + f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep) def __forced_printings(self, info_dict, filename, incomplete): def print_mandatory(field, actual_field=None): From ea0570820336a0fe9c3b530d1b0d1e59313274f4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 16 Apr 2023 12:01:19 -0500 Subject: [PATCH 044/501] [extractor/adobepass] Handle `Charter_Direct` MSO as `Spectrum` (#6824) Authored by: bashonly --- yt_dlp/extractor/adobepass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index e5944f7146..68a970f68c 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1573,7 +1573,7 @@ def extract_redirect_url(html, url=None, fatal=False): }), headers={ 'Content-Type': 'application/x-www-form-urlencoded' }) - elif mso_id == 'Spectrum': + elif mso_id in ('Spectrum', 'Charter_Direct'): # Spectrum's login for is dynamically loaded via JS so we need to hardcode the flow # as a one-off implementation. 
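            # Charter_Direct presents the same dynamically loaded login page,
            # so both MSO ids are routed through this Spectrum-specific flow.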
provider_redirect_page, urlh = provider_redirect_page_res From 7a6f6f24592a8065376f11a58e44878807732cf6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 16 Apr 2023 12:07:55 -0500 Subject: [PATCH 045/501] [extractor/reddit] Support cookies and short URLs (#6825) Closes #6665, Closes #6753 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 72 ++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 9dba3eca8f..3e458456c1 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,4 +1,3 @@ -import random import urllib.parse from .common import InfoExtractor @@ -14,7 +13,7 @@ class RedditIE(InfoExtractor): - _VALID_URL = r'https?://(?P[^/]+\.)?reddit(?:media)?\.com/(?P(?:r|user)/[^/]+/comments/(?P[^/?#&]+))' + _VALID_URL = r'https?://(?P(?:\w+\.)?reddit(?:media)?\.com)/(?P(?:(?:r|user)/[^/]+/)?comments/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -109,6 +108,46 @@ class RedditIE(InfoExtractor): 'age_limit': 0, 'channel_id': 'dumbfuckers_club', }, + }, { + # post link without subreddit + 'url': 'https://www.reddit.com/comments/124pp33', + 'md5': '15eec9d828adcef4468b741a7e45a395', + 'info_dict': { + 'id': 'antsenjc2jqa1', + 'ext': 'mp4', + 'display_id': '124pp33', + 'title': 'Harmless prank of some old friends', + 'uploader': 'Dudezila', + 'channel_id': 'ContagiousLaughter', + 'duration': 17, + 'upload_date': '20230328', + 'timestamp': 1680012043, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + }, { + # quarantined subreddit post + 'url': 'https://old.reddit.com/r/GenZedong/comments/12fujy3/based_hasan/', + 'md5': '3156ea69e3c1f1b6259683c5abd36e71', + 'info_dict': { + 'id': '8bwtclfggpsa1', + 'ext': 'mp4', + 'display_id': '12fujy3', + 'title': 'Based Hasan?', + 'uploader': 'KingNigelXLII', + 'channel_id': 'GenZedong', + 'duration': 16, + 'upload_date': '20230408', + 'timestamp': 1680979138, + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + 'skip': 'Requires account that has opted-in to the GenZedong subreddit', }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, @@ -137,21 +176,26 @@ class RedditIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _gen_session_id(): - id_length = 16 - rand_max = 1 << (id_length * 4) - return '%0.*x' % (id_length, random.randrange(rand_max)) - def _real_extract(self, url): - subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id') + host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') - self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id()) - self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D') - data = self._download_json(f'https://{subdomain}reddit.com/{slug}/.json', video_id, fatal=False) + data = self._download_json( + f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) if not data: - # Fall back to old.reddit.com in case the requested subdomain fails - data = self._download_json(f'https://old.reddit.com/{slug}/.json', video_id) + fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' + self.to_screen(f'{host} request failed, retrying with {fallback_host}') 
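            # Same .json endpoint on the alternate domain; expected_status=403
            # keeps the quarantined/private error payload for the checks below.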
+ data = self._download_json( + f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + + if traverse_obj(data, 'error') == 403: + reason = data.get('reason') + if reason == 'quarantined': + self.raise_login_required('Quarantined subreddit; an account that has opted in is required') + elif reason == 'private': + self.raise_login_required('Private subreddit; an account that has been approved is required') + else: + raise ExtractorError(f'HTTP Error 403 Forbidden; reason given: {reason}') + data = data[0]['data']['children'][0]['data'] video_url = data['url'] From 9c92b803fa24e48543ce969468d5404376e315b7 Mon Sep 17 00:00:00 2001 From: satan1st Date: Sun, 16 Apr 2023 19:20:10 +0200 Subject: [PATCH 046/501] [extractor/gronkh] Extract duration and chapters (#6817) Authored by: satan1st --- yt_dlp/extractor/gronkh.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py index b9370e36c1..1ae0a68936 100644 --- a/yt_dlp/extractor/gronkh.py +++ b/yt_dlp/extractor/gronkh.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from ..utils import ( OnDemandPagedList, + float_or_none, traverse_obj, unified_strdate, ) @@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor): 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', - 'upload_date': '20221111' + 'upload_date': '20221111', + 'chapters': 'count:3', + 'duration': 31463, }, 'params': {'skip_download': True} }, { @@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor): 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', - 'upload_date': '20211001' + 'upload_date': '20211001', + 'duration': 32058, }, 'params': {'skip_download': True} }, { @@ -56,6 +60,12 @@ def _real_extract(self, url): 'upload_date': unified_strdate(data_json.get('created_at')), 'formats': formats, 'subtitles': subtitles, + 'duration': float_or_none(data_json.get('source_length')), + 'chapters': traverse_obj(data_json, ( + 'chapters', lambda _, v: float_or_none(v['offset']) is not None, { + 'title': 'title', + 'start_time': ('offset', {float_or_none}), + })) or None, } From 2c566ed14101673c651c08c306c30fa5b4010b85 Mon Sep 17 00:00:00 2001 From: CoryTibbettsDev <70112527+CoryTibbettsDev@users.noreply.github.com> Date: Sun, 16 Apr 2023 17:26:37 +0000 Subject: [PATCH 047/501] [extractor/whyp] Add extractor (#6803) Authored by: CoryTibbettsDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/whyp.py | 50 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 yt_dlp/extractor/whyp.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 09903423d8..b08b3095e7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2295,6 +2295,7 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE from .wimtv import WimTVIE diff --git a/yt_dlp/extractor/whyp.py b/yt_dlp/extractor/whyp.py new file mode 100644 index 0000000000..fef89c3518 --- /dev/null +++ b/yt_dlp/extractor/whyp.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + str_or_none, + traverse_obj, + url_or_none, +) + + +class WhypIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?whyp\.it/tracks/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7', + 'md5': 'c1187b42ebf8605284e3dc92aeb33d16', + 'info_dict': { + 'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3', + 'id': '18337', + 'title': 'Home Page Example Track', + 'description': 'md5:bd758000fb93f3159339c852b5b9133c', + 'ext': 'mp3', + 'duration': 52.82, + 'uploader': 'Brad', + 'uploader_id': '1', + 'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg', + }, + }, { + 'url': 'https://www.whyp.it/tracks/18337', + 'only_matching': True, + }] + + def _real_extract(self, url): + unique_id = self._match_id(url) + webpage = self._download_webpage(url, unique_id) + data = self._search_nuxt_data(webpage, unique_id)['rawTrack'] + + return { + 'url': data['audio_url'], + 'id': unique_id, + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {float_or_none}), + 'uploader': ('user', 'username'), + 'uploader_id': ('user', 'id', {str_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + }), + 'ext': 'mp3', + 'vcodec': 'none', + 'http_headers': {'Referer': 'https://whyp.it/'}, + } From cbdf9408e6f1e35e98fd6477b3d6902df5b8a47f Mon Sep 17 00:00:00 2001 From: zhgwn <130610452+zhgwn@users.noreply.github.com> Date: Tue, 18 Apr 2023 04:18:29 +0200 Subject: [PATCH 048/501] [extractor/pornez] Support new URL formats (#6792) Closes #6791, Closes #6298 Authored by: zhgwn --- yt_dlp/extractor/pornez.py | 64 ++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/pornez.py b/yt_dlp/extractor/pornez.py index 3a22cb8210..bc45f865e9 100644 --- a/yt_dlp/extractor/pornez.py +++ b/yt_dlp/extractor/pornez.py @@ -1,42 +1,60 @@ from .common import InfoExtractor -from ..utils import int_or_none, urljoin +from ..utils import ( + clean_html, + int_or_none, + get_element_by_class, + urljoin, +) class PornezIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P[0-9]+)/' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?pornez\.net/(?:video(?P\w+)|watch)/' + _TESTS = [{ 'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/', - 'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc', 'info_dict': { 'id': '344819', 'ext': 'mp4', - 'title': r'mistresst funny_penis_names wmv', + 'title': 'mistresst funny_penis_names wmv', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, - } - } + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://pornez.net/watch/leana+lovings+stiff+for+stepdaughter/', + 'info_dict': { + 'id': '156161', + 'ext': 'mp4', + 'title': 'Watch leana lovings stiff for stepdaughter porn video.', + 'age_limit': 18, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://pornez.net/videovzs27fj/tutor4k-e14-blue-wave-1080p-nbq-tutor4k-e14-blue-wave/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - iframe_src = self._html_search_regex( - r']+src="([^"]+)"', webpage, 'iframe', fatal=True) - iframe_src = urljoin('https://pornez.net', iframe_src) - title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None) - if title is None: - title = self._search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title', fatal=True) - thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'title', default=None) - webpage = self._download_webpage(iframe_src, video_id) - entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0] - for format in entries['formats']: - height = self._search_regex(r'_(\d+)\.m3u8', format['url'], 'height') - format['format_id'] = '%sp' % height - format['height'] = int_or_none(height) + if not video_id: + video_id = self._search_regex( + r']+\bhref=["\']https?://pornez.net/\?p=(\w+)["\']', webpage, 'id') + + iframe_src = self._html_search_regex(r']+src="([^"]+)"', webpage, 'iframe') + iframe = self._download_webpage(urljoin('https://pornez.net', iframe_src), video_id) + + entries = self._parse_html5_media_entries(iframe_src, iframe, video_id)[0] + for fmt in entries['formats']: + height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height') + fmt['format_id'] = '%sp' % height + fmt['height'] = int_or_none(height) entries.update({ 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': 18 + 'title': (clean_html(get_element_by_class('video-title', webpage)) + or self._html_search_meta( + ['twitter:title', 'og:title', 'description'], webpage, 'title', default=None)), + 'thumbnail': self._html_search_meta(['thumbnailUrl'], webpage, 'thumb', default=None), + 'age_limit': 18, }) return entries From e5265dc6517478e589ee3c1ff0cb19bdf4e35ce1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:27:33 -0500 Subject: [PATCH 049/501] [extractor/stageplus] Add extractor (#6838) Closes #6806 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/stageplus.py | 518 ++++++++++++++++++++++++++++++++ 2 files changed, 519 insertions(+) create mode 100644 yt_dlp/extractor/stageplus.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b08b3095e7..deb92b5fce 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1786,6 +1786,7 @@ BellatorIE, ParamountNetworkIE, ) +from .stageplus import StagePlusVODConcertIE from .startrek import StarTrekIE from .stitcher import ( StitcherIE, diff --git a/yt_dlp/extractor/stageplus.py b/yt_dlp/extractor/stageplus.py new file mode 100644 index 0000000000..adb4ebbc2d --- /dev/null +++ b/yt_dlp/extractor/stageplus.py @@ -0,0 +1,518 @@ +import json +import uuid + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + traverse_obj, + try_call, + unified_timestamp, + url_or_none, +) + + +class StagePlusVODConcertIE(InfoExtractor): + _NETRC_MACHINE = 'stageplus' + _VALID_URL = r'https?://(?:www\.)?stage-plus\.com/video/(?Pvod_concert_\w+)' + _TESTS = [{ + 'url': 'https://www.stage-plus.com/video/vod_concert_APNM8GRFDPHMASJKBSPJACG', + 'playlist_count': 6, + 'info_dict': { + 'id': 'vod_concert_APNM8GRFDPHMASJKBSPJACG', + 'title': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', + 'description': 'md5:50f78ec180518c9bdb876bac550996fc', + 'artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'upload_date': '20230331', + 'timestamp': 1680249600, + 'release_date': '20210709', + 'release_timestamp': 1625788800, + 'thumbnails': 'count:3', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'performance_work_A1IN4PJFE9MM2RJ3CLBMUSJBBSOJAD9O', + 'ext': 'mp4', + 'title': 'Piano Concerto No. 2 in C Minor, Op. 
18', + 'description': 'md5:50f78ec180518c9bdb876bac550996fc', + 'upload_date': '20230331', + 'timestamp': 1680249600, + 'release_date': '20210709', + 'release_timestamp': 1625788800, + 'duration': 2207, + 'chapters': 'count:5', + 'artist': ['Yuja Wang'], + 'composer': ['Sergei Rachmaninoff'], + 'album': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', + 'album_artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'track': 'Piano Concerto No. 2 in C Minor, Op. 18', + 'track_number': 1, + 'genre': 'Instrumental Concerto', + }, + }], + 'params': {'skip_download': 'm3u8'}, + }] + + # TODO: Prune this after livestream and/or album extractors are added + _GRAPHQL_QUERY = '''query videoDetailPage($videoId: ID!, $sliderItemsFirst: Int = 24) { + node(id: $videoId) { + __typename + ...LiveConcertFields + ... on LiveConcert { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + groups { + id + name + typeDisplayName + } + shortDescription + performanceWorks { + ...livePerformanceWorkFields + } + totalDuration + sliders { + ...contentContainerFields + } + vodConcert { + __typename + id + } + } + ...VideoFields + ... on Video { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + isLossless + description + productionDate + takedownDate + sliders { + ...contentContainerFields + } + } + ...VodConcertFields + ... on VodConcert { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + groups { + id + name + typeDisplayName + } + performanceWorks { + ...PerformanceWorkFields + } + shortDescription + productionDate + takedownDate + sliders { + ...contentContainerFields + } + } + } +} + +fragment LiveConcertFields on LiveConcert { + endTime + id + pictures { + ...PictureFields + } + reruns { + ...liveConcertRerunFields + } + publicationLevel + startTime + streamStartTime + subtitle + title + typeDisplayName + stream { + ...liveStreamFields + } + trailerStream { + ...streamFields + } + geoAccessCountries + geoAccessMode +} + +fragment PictureFields on Picture { + id + url + type +} + +fragment liveConcertRerunFields on LiveConcertRerun { + streamStartTime + endTime + startTime + stream { + ...rerunStreamFields + } +} + +fragment rerunStreamFields on RerunStream { + publicationLevel + streamType + url +} + +fragment liveStreamFields on LiveStream { + publicationLevel + streamType + url +} + +fragment streamFields on Stream { + publicationLevel + streamType + url +} + +fragment RoleFields on Role { + __typename + id + type + displayName +} + +fragment livePerformanceWorkFields on LivePerformanceWork { + __typename + id + artists { + ...artistWithRoleFields + } + groups { + edges { + node { + id + name + typeDisplayName + } + } + } + work { + ...workFields + } +} + +fragment artistWithRoleFields on ArtistWithRoleConnection { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } +} + +fragment workFields on Work { + id + title + movements { + id + title + } + composers { + id + name + } + genre { + id + title + } +} + +fragment contentContainerFields on CuratedContentContainer { + __typename + ...SliderFields + ...BannerFields +} + +fragment SliderFields on Slider { + id + headline + items(first: $sliderItemsFirst) { + edges { + node { + id + __typename + ...AlbumFields + ...ArtistFields + ...EpochFields + ...GenreFields + ...GroupFields + ...LiveConcertFields + 
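  # Each curated slider item resolves to exactly one of these node types,
  # whose fields are supplied by the matching typed fragment below.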
...PartnerFields + ...PerformanceWorkFields + ...VideoFields + ...VodConcertFields + } + } + } +} + +fragment AlbumFields on Album { + artistAndGroupDisplayInfo + id + pictures { + ...PictureFields + } + title +} + +fragment ArtistFields on Artist { + id + name + roles { + ...RoleFields + } + pictures { + ...PictureFields + } +} + +fragment EpochFields on Epoch { + id + endYear + pictures { + ...PictureFields + } + startYear + title +} + +fragment GenreFields on Genre { + id + pictures { + ...PictureFields + } + title +} + +fragment GroupFields on Group { + id + name + typeDisplayName + pictures { + ...PictureFields + } +} + +fragment PartnerFields on Partner { + id + name + typeDisplayName + subtypeDisplayName + pictures { + ...PictureFields + } +} + +fragment PerformanceWorkFields on PerformanceWork { + __typename + id + artists { + ...artistWithRoleFields + } + groups { + edges { + node { + id + name + typeDisplayName + } + } + } + work { + ...workFields + } + stream { + ...streamFields + } + vodConcert { + __typename + id + } + duration + cuePoints { + mark + title + } +} + +fragment VideoFields on Video { + id + archiveReleaseDate + title + subtitle + pictures { + ...PictureFields + } + stream { + ...streamFields + } + trailerStream { + ...streamFields + } + duration + typeDisplayName + duration + geoAccessCountries + geoAccessMode + publicationLevel + takedownDate +} + +fragment VodConcertFields on VodConcert { + id + archiveReleaseDate + pictures { + ...PictureFields + } + subtitle + title + typeDisplayName + totalDuration + geoAccessCountries + geoAccessMode + trailerStream { + ...streamFields + } + publicationLevel + takedownDate +} + +fragment BannerFields on Banner { + description + link + pictures { + ...PictureFields + } + title +}''' + + _TOKEN = None + + def _perform_login(self, username, password): + auth = self._download_json('https://audience.api.stageplus.io/oauth/token', None, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://www.stage-plus.com', + }, data=json.dumps({ + 'grant_type': 'password', + 'username': username, + 'password': password, + 'device_info': 'Chrome (Windows)', + 'client_device_id': str(uuid.uuid4()), + }, separators=(',', ':')).encode(), note='Logging in') + + if auth.get('access_token'): + self._TOKEN = auth['access_token'] + + def _real_initialize(self): + if self._TOKEN: + return + + self._TOKEN = try_call( + lambda: self._get_cookies('https://www.stage-plus.com/')['dgplus_access_token'].value) + if not self._TOKEN: + self.raise_login_required() + + def _real_extract(self, url): + concert_id = self._match_id(url) + + data = self._download_json('https://audience.api.stageplus.io/graphql', concert_id, headers={ + 'authorization': f'Bearer {self._TOKEN}', + 'content-type': 'application/json', + 'Origin': 'https://www.stage-plus.com', + }, data=json.dumps({ + 'query': self._GRAPHQL_QUERY, + 'variables': {'videoId': concert_id}, + 'operationName': 'videoDetailPage' + }, separators=(',', ':')).encode())['data']['node'] + + metadata = traverse_obj(data, { + 'title': 'title', + 'description': ('shortDescription', {str}), + 'artist': ('artists', 'edges', ..., 'node', 'name'), + 'timestamp': ('archiveReleaseDate', {unified_timestamp}), + 'release_timestamp': ('productionDate', {unified_timestamp}), + }) + + thumbnails = traverse_obj(data, ('pictures', lambda _, v: url_or_none(v['url']), { + 'id': 'name', + 'url': 'url', + })) or None + + m3u8_headers = {'jwt': self._TOKEN} + + entries = [] + for idx, video in enumerate(traverse_obj(data, 
( + 'performanceWorks', lambda _, v: v['id'] and url_or_none(v['stream']['url']))), 1): + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', headers=m3u8_headers) + entries.append({ + 'id': video['id'], + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': m3u8_headers, + 'album': metadata.get('title'), + 'album_artist': metadata.get('artist'), + 'track_number': idx, + **metadata, + **traverse_obj(video, { + 'title': ('work', 'title'), + 'track': ('work', 'title'), + 'duration': ('duration', {float_or_none}), + 'chapters': ( + 'cuePoints', lambda _, v: float_or_none(v['mark']) is not None, { + 'title': 'title', + 'start_time': ('mark', {float_or_none}), + }), + 'artist': ('artists', 'edges', ..., 'node', 'name'), + 'composer': ('work', 'composers', ..., 'name'), + 'genre': ('work', 'genre', 'title'), + }), + }) + + return self.playlist_result(entries, concert_id, thumbnails=thumbnails, **metadata) From ab29e47029e2f5b48abbbab78e82faf7cf6e9506 Mon Sep 17 00:00:00 2001 From: qbnu <93988953+qbnu@users.noreply.github.com> Date: Tue, 18 Apr 2023 02:37:37 +0000 Subject: [PATCH 050/501] [extractor/bilibili] Support festival videos (#6547) Closes #6138 Authored by: qbnu --- yt_dlp/extractor/bilibili.py | 86 ++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 91d436dd85..faa2218ced 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -134,7 +134,7 @@ def _get_all_children(self, reply): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -282,19 +282,60 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 1643947497, + 'upload_date': '20220204', + 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - video_data = initial_state['videoData'] + is_festival = 'videoData' not in 
initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, @@ -317,20 +358,39 @@ def _real_extract(self, url): cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + festival_info = {} + if is_festival: + play_info = self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note='Extracting festival video formats')['data'] + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + return { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), 'subtitles': self.extract_subtitles(video_id, aid, cid), From 6a765f135ccb654861336ea27a2c1c24ea8e286f Mon Sep 17 00:00:00 2001 From: vidiot720 <128325907+vidiot720@users.noreply.github.com> Date: Wed, 19 Apr 2023 09:46:57 +1000 Subject: [PATCH 051/501] [extractor/sbs] Overhaul extractor for new API (#6839) Closes #6543 Authored by: vidiot720, dirkf, bashonly --- yt_dlp/extractor/sbs.py | 109 ++++++++++++++++++++++++++++++---------- yt_dlp/utils.py | 4 ++ 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 45320339da..ac0b6de202 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -1,7 +1,13 @@ from .common import InfoExtractor from ..utils import ( - 
smuggle_url, - ExtractorError, + HEADRequest, + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, + traverse_obj, + update_url_query, + url_or_none, ) @@ -11,7 +17,7 @@ class SBSIE(InfoExtractor): https?://(?:www\.)?sbs\.com\.au/(?: ondemand(?: /video/(?:single/)?| - /movie/[^/]+/| + /(?:movie|tv-program)/[^/]+/| /(?:tv|news)-series/(?:[^/]+/){3}| .*?\bplay=|/watch/ )|news/(?:embeds/)?video/ @@ -27,18 +33,21 @@ class SBSIE(InfoExtractor): # Original URL is handled by the generic IE which finds the iframe: # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', - 'md5': '3150cf278965eeabb5b4cea1c963fe0a', + 'md5': '31f84a7a19b53635db63c73f8ab0c4a7', 'info_dict': { - 'id': '_rFBPRPO4pMR', + 'id': '320403011771', # '_rFBPRPO4pMR', 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'duration': 308, 'timestamp': 1408613220, 'upload_date': '20140821', 'uploader': 'SBSC', + 'tags': None, + 'categories': None, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', 'only_matching': True, @@ -70,34 +79,80 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602', + 'only_matching': True, }] + _GEO_COUNTRIES = ['AU'] + _AUS_TV_PARENTAL_GUIDELINES = { + 'P': 0, + 'C': 7, + 'G': 0, + 'PG': 0, + 'M': 14, + 'MA15+': 15, + 'MAV15+': 15, + 'R18+': 18, + } + _PLAYER_API = 'https://www.sbs.com.au/api/v3' + def _real_extract(self, url): video_id = self._match_id(url) - player_params = self._download_json( - 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) + formats, subtitles = self._extract_smil_formats_and_subtitles( + update_url_query(f'{self._PLAYER_API}/video_smil', {'id': video_id}), video_id) - error = player_params.get('error') - if error: - error_message = 'Sorry, The video you are looking for does not exist.' - video_data = error.get('results') or {} - error_code = error.get('errorCode') - if error_code == 'ComingSoon': - error_message = '%s is not yet available.' % video_data.get('title', '') - elif error_code in ('Forbidden', 'intranetAccessOnly'): - error_message = 'Sorry, This video cannot be accessed via this website' - elif error_code == 'Expired': - error_message = 'Sorry, %s is no longer available.' 
% video_data.get('title', '') - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + if not formats: + urlh = self._request_webpage( + HEADRequest('https://sbs-vod-prod-01.akamaized.net/'), video_id, + note='Checking geo-restriction', fatal=False, expected_status=403) + if urlh: + error_reasons = urlh.headers.get_all('x-error-reason') or [] + if 'geo-blocked' in error_reasons: + self.raise_geo_restricted(countries=['AU']) + self.raise_no_formats('No formats are available', video_id=video_id) - urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('html') - or urls.get('standard') or player_params['relatedItemsURL']) + media = traverse_obj(self._download_json( + f'{self._PLAYER_API}/video_stream', video_id, fatal=False, + query={'id': video_id, 'context': 'tv'}), ('video_object', {dict})) or {} + + media.update(self._download_json( + f'https://catalogue.pr.sbsod.com/mpx-media/{video_id}', + video_id, fatal=not media) or {}) + + # For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'. + if traverse_obj(media, ('partOfSeries', {dict})): + media['epName'] = traverse_obj(media, ('title', {str})) return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, - 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), - 'is_live': player_params.get('streamType') == 'live', + **traverse_obj(media, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'channel': ('taxonomy', 'channel', 'name', {str}), + 'series': ((('partOfSeries', 'name'), 'seriesTitle'), {str}), + 'series_id': ((('partOfSeries', 'uuid'), 'seriesID'), {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('epName', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'timestamp': (('datePublished', ('publication', 'startDate')), {parse_iso8601}), + 'release_year': ('releaseYear', {int_or_none}), + 'duration': ('duration', ({float_or_none}, {parse_duration})), + 'is_live': ('liveStream', {bool}), + 'age_limit': ( + ('classificationID', 'contentRating'), {str.upper}, {self._AUS_TV_PARENTAL_GUIDELINES.get}), + }, get_all=False), + **traverse_obj(media, { + 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}), + 'tags': (('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), ..., {str}), + 'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['contentUrl']), { + 'id': ('name', {str}), + 'url': 'contentUrl', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + 'formats': formats, + 'subtitles': subtitles, + 'uploader': 'SBSC', } diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 40533c2cb4..746a2885d6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -4093,6 +4093,10 @@ def data(self, data): def close(self): return self._out.strip() + # Fix UTF-8 encoded file wrongly marked as UTF-16. 
See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870 + # This will not trigger false positives since only UTF-8 text is being replaced + dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'') + def parse_node(node): target = TTMLPElementParser() parser = xml.etree.ElementTree.XMLParser(target=target) From 8f0be90ecb3b8d862397177bb226f17b245ef933 Mon Sep 17 00:00:00 2001 From: garret <76261416+garret1317@users.noreply.github.com> Date: Wed, 19 Apr 2023 05:21:24 +0100 Subject: [PATCH 052/501] [extractor/nhk] Add `NhkRadiru` extractor (#6819) * Add `NhkRadioNewsPage` extractor Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/nhk.py | 140 +++++++++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index deb92b5fce..58137d7f6e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1232,6 +1232,8 @@ NhkForSchoolBangumiIE, NhkForSchoolSubjectIE, NhkForSchoolProgramListIE, + NhkRadioNewsPageIE, + NhkRadiruIE, ) from .nhl import NHLIE from .nick import ( diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 59702b247e..1597962acf 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -6,7 +6,8 @@ traverse_obj, unescapeHTML, unified_timestamp, - urljoin + urljoin, + url_or_none ) @@ -334,3 +335,140 @@ def _real_extract(self, url): for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []] return self.playlist_result(bangumis, program_id, title, description) + + +class NhkRadiruIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + IE_DESC = 'NHK らじる (Radiru/Rajiru)' + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P[\da-zA-Z]+)_(?P[\da-zA-Z]+)(?:_(?P[\da-zA-Z]+))?' 
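    # IDs have the form <site>_<corner>[_<headline>]: with a headline id a
    # single episode is extracted, otherwise the corner page is returned as a
    # playlist of its episodes.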
+ _TESTS = [{ + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544', + 'skip': 'Episode expired on 2023-04-16', + 'info_dict': { + 'channel': 'NHK-FM', + 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9', + 'ext': 'm4a', + 'id': '0449_01_3853544', + 'series': 'ジャズ・トゥナイト', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg', + 'timestamp': 1680969600, + 'title': 'ジャズ・トゥナイト NEWジャズ特集', + 'upload_date': '20230408', + 'release_timestamp': 1680962400, + 'release_date': '20230408', + 'was_live': True, + }, + }, { + # playlist, airs every weekday so it should _hopefully_ be okay forever + 'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01', + 'info_dict': { + 'id': '0458_01', + 'title': 'ベストオブクラシック', + 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', + 'channel': 'NHK-FM', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', + }, + 'playlist_mincount': 3, + }, { + # one with letters in the id + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470', + 'note': 'Expires on 2024-03-31', + 'info_dict': { + 'id': 'F300_06_3738470', + 'ext': 'm4a', + 'title': '有島武郎「一房のぶどう」', + 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', + 'channel': 'NHKラジオ第1、NHK-FM', + 'timestamp': 1635757200, + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', + 'release_date': '20161207', + 'series': 'らじる文庫 by ラジオ深夜便 ', + 'release_timestamp': 1481126700, + 'upload_date': '20211101', + } + }, { + # news + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109', + 'skip': 'Expires on 2023-04-17', + 'info_dict': { + 'id': 'F261_01_3855109', + 'ext': 'm4a', + 'channel': 'NHKラジオ第1', + 'timestamp': 1681635900, + 'release_date': '20230416', + 'series': 'NHKラジオニュース', + 'title': '午後6時のNHKニュース', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'upload_date': '20230416', + 'release_timestamp': 1681635600, + }, + }] + + def _extract_episode_info(self, headline, programme_id, series_meta): + episode_id = f'{programme_id}_{headline["headline_id"]}' + episode = traverse_obj(headline, ('file_list', 0, {dict})) + + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'series': series_meta.get('title'), + 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'), + **traverse_obj(episode, { + 'title': 'file_title', + 'description': 'file_title_sub', + 'timestamp': ('open_time', {unified_timestamp}), + 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}), + }), + } + + def _real_extract(self, url): + site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') + programme_id = f'{site_id}_{corner_id}' + + if site_id == 'F261': + json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json' + else: + json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json' + + meta = self._download_json(json_url, programme_id)['main'] + + series_meta = traverse_obj(meta, { + 'title': 'program_name', + 'channel': 'media_name', + 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), + }, get_all=False) + + if headline_id: + return self._extract_episode_info( + traverse_obj(meta, ( + 'detail_list', lambda _, v: v['headline_id'] == 
headline_id), get_all=False), + programme_id, series_meta) + + def entries(): + for headline in traverse_obj(meta, ('detail_list', ..., {dict})): + yield self._extract_episode_info(headline, programme_id, series_meta) + + return self.playlist_result( + entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta) + + +class NhkRadioNewsPageIE(InfoExtractor): + _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])' + _TESTS = [{ + # airs daily, on-the-hour most hours + 'url': 'https://www.nhk.or.jp/radionews/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'F261_01', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', + 'channel': 'NHKラジオ第1', + 'title': 'NHKラジオニュース', + } + }] + + def _real_extract(self, url): + return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE) From 1ea15603d852971ed7d92f4de12808b27b3d9370 Mon Sep 17 00:00:00 2001 From: truedread Date: Fri, 21 Apr 2023 20:11:51 -0400 Subject: [PATCH 053/501] [extractor/wevidi] Add extractor (#6868) Closes #6129 Authored by: truedread --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/wevidi.py | 108 ++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 yt_dlp/extractor/wevidi.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 58137d7f6e..a81682e437 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2298,6 +2298,7 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .wevidi import WeVidiIE from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE diff --git a/yt_dlp/extractor/wevidi.py b/yt_dlp/extractor/wevidi.py new file mode 100644 index 0000000000..3b6d03238f --- /dev/null +++ b/yt_dlp/extractor/wevidi.py @@ -0,0 +1,108 @@ +from .common import InfoExtractor +from ..utils import clean_html, float_or_none, get_element_by_class, js_to_json, traverse_obj + + +class WeVidiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?wevidi\.net/watch/(?P[\w-]{11})' + _TESTS = [{ + 'url': 'https://wevidi.net/watch/2th7UO5F4KV', + 'md5': 'b913d1ff5bbad499e2c7ef4aa6d829d7', + 'info_dict': { + 'id': '2th7UO5F4KV', + 'ext': 'mp4', + 'title': 'YouTube Alternative: WeVidi - customizable channels & more', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:73a27d0a87d49fbcc5584566326ebeed', + 'uploader': 'eclecRC', + 'duration': 932.098, + } + }, { + 'url': 'https://wevidi.net/watch/ievRuuQHbPS', + 'md5': 'ce8a94989a959bff9003fa27ee572935', + 'info_dict': { + 'id': 'ievRuuQHbPS', + 'ext': 'mp4', + 'title': 'WeVidi Playlists', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:32cdfca272687390d9bd9b0c9c6153ee', + 'uploader': 'WeVidi', + 'duration': 36.1999, + } + }, { + 'url': 'https://wevidi.net/watch/PcMzDWaQSWb', + 'md5': '55ee0d3434be5d9e5cc76b83f2bb57ec', + 'info_dict': { + 'id': 'PcMzDWaQSWb', + 'ext': 'mp4', + 'title': 'Cat blep', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e2c9e2b54b8bb424cc64937c8fdc068f', + 'uploader': 'WeVidi', + 'duration': 41.972, + } + }, { + 'url': 'https://wevidi.net/watch/wJnRqDHNe_u', + 'md5': 'c8f263dd47e66cc17546b3abf47b5a77', + 'info_dict': { + 'id': 'wJnRqDHNe_u', + 'ext': 'mp4', + 'title': 'Gissy Talks: YouTube Alternatives', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e65036f0d4af80e0af191bd11af5195e', + 'uploader': 'GissyEva', + 
'duration': 630.451, + } + }, { + 'url': 'https://wevidi.net/watch/4m1c4yJR_yc', + 'md5': 'c63ce5ca6990dce86855fc02ca5bc1ed', + 'info_dict': { + 'id': '4m1c4yJR_yc', + 'ext': 'mp4', + 'title': 'Enough of that! - Awesome Exilez Podcast', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:96af99dd63468b2dfab3020560e3e9b2', + 'uploader': 'eclecRC', + 'duration': 6.804, + } + }] + + def _extract_formats(self, wvplayer_props): + # Taken from WeVidi player JS: https://wevidi.net/layouts/default/static/player.min.js + resolution_map = { + 1: 144, + 2: 240, + 3: 360, + 4: 480, + 5: 720, + 6: 1080 + } + + src_path = f'{wvplayer_props["srcVID"]}/{wvplayer_props["srcUID"]}/{wvplayer_props["srcNAME"]}' + for res in traverse_obj(wvplayer_props, ('resolutions', ..., {int}, {lambda x: x or None})): + format_id = str(-(res // -2) - 1) + yield { + 'acodec': 'mp4a.40.2', + 'ext': 'mp4', + 'format_id': format_id, + 'height': resolution_map.get(res), + 'url': f'https://www.wevidi.net/videoplayback/{src_path}/{format_id}', + 'vcodec': 'avc1.42E01E', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + wvplayer_props = self._search_json( + r'WVPlayer\(', webpage, 'player', video_id, + transform_source=lambda x: js_to_json(x.replace('||', '}'))) + + return { + 'id': video_id, + 'title': clean_html(get_element_by_class('video_title', webpage)), + 'description': clean_html(get_element_by_class('descr_long', webpage)), + 'uploader': clean_html(get_element_by_class('username', webpage)), + 'formats': list(self._extract_formats(wvplayer_props)), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': float_or_none(wvplayer_props.get('duration')), + } From 80b732b7a9585b2a61e456dc0d2d014a439cbaee Mon Sep 17 00:00:00 2001 From: JC-Chung <52159296+JC-Chung@users.noreply.github.com> Date: Sun, 23 Apr 2023 07:25:04 +0800 Subject: [PATCH 054/501] [extractor/twitch] Extract original size thumbnail (#6629) Authored by: JC-Chung --- yt_dlp/extractor/twitch.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 6321297bb1..9b333f6f67 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -179,6 +179,14 @@ def _download_access_token(self, video_id, token_kind, param_name): video_id, ops, 'Downloading %s access token GraphQL' % token_kind)['data'][method] + def _get_thumbnails(self, thumbnail): + return [{ + 'url': re.sub(r'\d+x\d+(\.\w+)($|(?=[?#]))', r'0x0\g<1>', thumbnail), + 'preference': 1, + }, { + 'url': thumbnail, + }] if thumbnail else None + class TwitchVodIE(TwitchBaseIE): IE_NAME = 'twitch:vod' @@ -460,15 +468,13 @@ def _extract_info_gql(self, info, item_id): is_live, thumbnail = True, None else: is_live = False - for p in ('width', 'height'): - thumbnail = thumbnail.replace('{%s}' % p, '0') return { 'id': vod_id, 'title': info.get('title') or 'Untitled Broadcast', 'description': info.get('description'), 'duration': int_or_none(info.get('lengthSeconds')), - 'thumbnail': thumbnail, + 'thumbnails': self._get_thumbnails(thumbnail), 'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str), 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), 'timestamp': unified_timestamp(info.get('publishedAt')), @@ -1053,7 +1059,7 @@ def _real_extract(self, url): 'display_id': channel_name, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': 
self._get_thumbnails(thumbnail), 'uploader': uploader, 'uploader_id': channel_name, 'timestamp': timestamp, From 78fde6e3398ff11e5d383a66b28664badeab5180 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 24 Apr 2023 17:21:20 +0530 Subject: [PATCH 055/501] [outtmpl] Allow `\n` in replacements and default. Fixes: https://github.com/yt-dlp/yt-dlp/issues/6808#issuecomment-1510055357 Fixes: https://github.com/yt-dlp/yt-dlp/issues/6808#issuecomment-1510363645 --- test/test_YoutubeDL.py | 1 + yt_dlp/YoutubeDL.py | 2 +- yt_dlp/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8da1e5e4b2..49ae9e2b12 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -822,6 +822,7 @@ def expect_same_infodict(out): test('%(title&foo|baz)s.bar', 'baz.bar') test('%(x,id&foo|baz)s.bar', 'foo.bar') test('%(x,title&foo|baz)s.bar', 'baz.bar') + test('%(title&\n|)s', '\n') # Laziness def gen(): diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 31f7645dca..61c149e475 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1156,7 +1156,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): } MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) - INTERNAL_FORMAT_RE = re.compile(rf'''(?x) + INTERNAL_FORMAT_RE = re.compile(rf'''(?xs) (?P-)? (?P{FIELD_RE}) (?P(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 84aeda7f12..d334a9caaa 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -243,7 +243,7 @@ def _dict_from_options_callback( if multiple_keys: allowed_keys = fr'({allowed_keys})(,({allowed_keys}))*' mobj = re.match( - fr'(?i)(?P{allowed_keys}){delimiter}(?P.*)$', + fr'(?is)(?P{allowed_keys}){delimiter}(?P.*)$', value[0] if multiple_args else value) if mobj is not None: keys, val = mobj.group('keys').split(','), mobj.group('val') From ec9311c41b111110bc52cfbd6ea682c6fb23f77a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 24 Apr 2023 18:31:36 +0530 Subject: [PATCH 056/501] [outtmpl] Support `str.format` syntax inside replacements Closes #6843 --- README.md | 2 +- test/test_YoutubeDL.py | 5 ++++- yt_dlp/YoutubeDL.py | 18 ++++++++++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 35229f728e..efb490ab1b 100644 --- a/README.md +++ b/README.md @@ -1246,7 +1246,7 @@ # OUTPUT TEMPLATE 1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s` -1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. +1. **Replacement**: A replacement value can be specified using a `&` separator according to the [`str.format` mini-language](https://docs.python.org/3/library/string.html#format-specification-mini-language). If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. E.g. `%(chapters&has chapters|no chapters)s`, `%(title&TITLE={:>20}|NO TITLE)s` 1. 
**Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s` diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 49ae9e2b12..3c26bd7c65 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -822,7 +822,10 @@ def expect_same_infodict(out): test('%(title&foo|baz)s.bar', 'baz.bar') test('%(x,id&foo|baz)s.bar', 'foo.bar') test('%(x,title&foo|baz)s.bar', 'baz.bar') - test('%(title&\n|)s', '\n') + test('%(id&a\nb|)s', ('a\nb', 'a b')) + test('%(id&hi {:>10} {}|)s', 'hi 1234 1234') + test(R'%(id&{0} {}|)s', 'NA') + test(R'%(id&{0.1}|)s', 'NA') # Laziness def gen(): diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 61c149e475..dce6cf928c 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -21,7 +21,7 @@ import traceback import unicodedata import urllib.request -from string import ascii_letters +from string import Formatter, ascii_letters from .cache import Cache from .compat import compat_os_name, compat_shlex_quote @@ -1237,6 +1237,14 @@ def _dumpjson_default(obj): return list(obj) return repr(obj) + class _ReplacementFormatter(Formatter): + def get_field(self, field_name, args, kwargs): + if field_name.isdigit(): + return args[0], -1 + raise ValueError('Unsupported field') + + replacement_formatter = _ReplacementFormatter() + def create_key(outer_mobj): if not outer_mobj.group('has_key'): return outer_mobj.group(0) @@ -1258,7 +1266,13 @@ def create_key(outer_mobj): if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = f'0{field_size_compat_map[key]:d}d' - value = default if value is None else value if replacement is None else replacement + if value is None: + value = default + elif replacement is not None: + try: + value = replacement_formatter.format(replacement, value) + except ValueError: + value = na flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' From d669772c65e8630162fd6555d0a578b246591921 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 24 Apr 2023 18:52:09 +0530 Subject: [PATCH 057/501] Add `--no-quiet` Closes #6796 --- README.md | 1 + yt_dlp/__init__.py | 3 ++- yt_dlp/options.py | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index efb490ab1b..ef0c236b5f 100644 --- a/README.md +++ b/README.md @@ -752,6 +752,7 @@ ## Internet Shortcut Options: ## Verbosity and Simulation Options: -q, --quiet Activate quiet mode. If used with --verbose, print the log to stderr + --no-quiet Deactivate quiet mode. 
(Default) --no-warnings Ignore warnings -s, --simulate Do not download the video and do not write anything to disk diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index bdac1212c6..79b9a7679f 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -704,7 +704,8 @@ def parse_options(argv=None): 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' )) - opts.quiet = opts.quiet or any_getting or opts.print_json or bool(opts.forceprint) + if opts.quiet is None: + opts.quiet = any_getting or opts.print_json or bool(opts.forceprint) playlist_pps = [pp for pp in postprocessors if pp.get('when') == 'playlist'] write_playlist_infojson = (opts.writeinfojson and not opts.clean_infojson diff --git a/yt_dlp/options.py b/yt_dlp/options.py index d334a9caaa..a2f508552d 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1079,8 +1079,12 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options') verbosity.add_option( '-q', '--quiet', - action='store_true', dest='quiet', default=False, + action='store_true', dest='quiet', default=None, help='Activate quiet mode. If used with --verbose, print the log to stderr') + verbosity.add_option( + '--no-quiet', + action='store_false', dest='quiet', + help='Deactivate quiet mode. (Default)') verbosity.add_option( '--no-warnings', dest='no_warnings', action='store_true', default=False, From 04f8018a0544736a18494bc3899d06b05b78fae6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 24 Apr 2023 18:59:07 +0530 Subject: [PATCH 058/501] [extractor/hentaistigma] Remove extractor Piracy site Closes #6870 --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/hentaistigma.py | 37 -------------------------------- 2 files changed, 38 deletions(-) delete mode 100644 yt_dlp/extractor/hentaistigma.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a81682e437..750708d77e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -721,7 +721,6 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE from .hketv import HKETVIE from .hidive import HiDiveIE diff --git a/yt_dlp/extractor/hentaistigma.py b/yt_dlp/extractor/hentaistigma.py deleted file mode 100644 index ca5ffc2aea..0000000000 --- a/yt_dlp/extractor/hentaistigma.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class HentaiStigmaIE(InfoExtractor): - _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P[^/]+)' - _TEST = { - 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/', - 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23', - 'info_dict': { - 'id': 'inyouchuu-etsu-bonus', - 'ext': 'mp4', - 'title': 'Inyouchuu Etsu Bonus', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r']+class="posttitle"[^>]*>]*>([^<]+)', - webpage, 'title') - wrap_url = self._html_search_regex( - r']+src="([^"]+mp4)"', webpage, 'wrapper url') - wrap_webpage = self._download_webpage(wrap_url, video_id) - - video_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'age_limit': 18, - } From 
c16644642b08e2bf4130a6c5fa01395d8718c990 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 24 Apr 2023 19:38:58 +0530 Subject: [PATCH 059/501] Add option `--xff` Deprecates `--geo-bypass`, `--no-geo-bypass`, `--geo-bypass-country`, `--geo-bypass-ip-block` --- README.md | 18 +++++++++--------- yt_dlp/__init__.py | 13 +++++++++---- yt_dlp/options.py | 25 +++++++++++++++---------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index ef0c236b5f..47da19011f 100644 --- a/README.md +++ b/README.md @@ -463,15 +463,11 @@ ## Geo-restriction: specified by --proxy (or none, if the option is not present) is used for the actual downloading - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (default) - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR notation + --xff VALUE How to fake X-Forwarded-For HTTP header to + try bypassing geographic restriction. One of + "default" (Only when known to be useful), + "never", a two-letter ISO 3166-2 country + code, or an IP block in CIDR notation ## Video Selection: -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items @@ -2168,6 +2164,10 @@ #### Not recommended --youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest) --youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest) --youtube-include-hls-manifest Default (Alias: --no-youtube-skip-hls-manifest) + --geo-bypass --xff "default" + --no-geo-bypass --xff "never" + --geo-bypass-country CODE --xff CODE + --geo-bypass-ip-block IP_BLOCK --xff IP_BLOCK #### Developer options diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 79b9a7679f..47ee3cc02f 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -396,12 +396,17 @@ def metadataparser_actions(f): except Exception as err: raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') - geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country - if geo_bypass_code is not None: + opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None + if opts.geo_bypass.lower() not in ('default', 'never'): try: - GeoUtils.random_ipv4(geo_bypass_code) + GeoUtils.random_ipv4(opts.geo_bypass) except Exception: - raise ValueError('unsupported geo-bypass country or ip-block') + raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"') + if len(opts.geo_bypass) == 2: + opts.geo_bypass_country = opts.geo_bypass + else: + opts.geo_bypass_ip_block = opts.geo_bypass + opts.geo_bypass = opts.geo_bypass.lower() != 'never' opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index a2f508552d..362a648cdd 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -519,22 +519,27 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', help=optparse.SUPPRESS_HELP) + geo.add_option( + '--xff', metavar='VALUE', + dest='geo_bypass', default="default", + help=( + 'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. 
' + 'One of "default" (Only when known to be useful), "never", ' + 'a two-letter ISO 3166-2 country code, or an IP block in CIDR notation')) geo.add_option( '--geo-bypass', - action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (default)') + action='store_const', dest='geo_bypass', const='default', + help=optparse.SUPPRESS_HELP) geo.add_option( '--no-geo-bypass', - action='store_false', dest='geo_bypass', - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') + action='store_const', dest='geo_bypass', const='never', + help=optparse.SUPPRESS_HELP) geo.add_option( - '--geo-bypass-country', metavar='CODE', - dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') + '--geo-bypass-country', metavar='CODE', dest='geo_bypass', + help=optparse.SUPPRESS_HELP) geo.add_option( - '--geo-bypass-ip-block', metavar='IP_BLOCK', - dest='geo_bypass_ip_block', default=None, - help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') + '--geo-bypass-ip-block', metavar='IP_BLOCK', dest='geo_bypass', + help=optparse.SUPPRESS_HELP) selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( From 21b5ec86c2c37d10c5bb97edd7051d3aac16bb3e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 24 Apr 2023 19:56:35 +0200 Subject: [PATCH 060/501] [utils] `traverse_obj`: Allow iterables in traversal (#6902) Authored by: Grub4K --- test/test_utils.py | 4 ++++ yt_dlp/utils.py | 7 +++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index d4a301583f..f2f3b8170a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2016,6 +2016,8 @@ def test_traverse_obj(self): msg='nested `...` queries should work') self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4), msg='`...` query result should be flattened') + self.assertEqual(traverse_obj(range(4), ...), list(range(4)), + msg='`...` should accept iterables') # Test function as key self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), @@ -2023,6 +2025,8 @@ def test_traverse_obj(self): msg='function as query key should perform a filter based on (key, value)') self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, msg='exceptions in the query function should be catched') + self.assertEqual(traverse_obj(range(4), lambda _, x: x % 2 == 0), [0, 2], + msg='function key should accept iterables') if __debug__: with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): traverse_obj(_TEST_DATA, lambda a: ...) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 746a2885d6..f69311462d 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5528,7 +5528,6 @@ def traverse_obj( If no `default` is given and the last path branches, a `list` of results is always returned. If a path ends on a `dict` that result will always be a `dict`. 
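[Editor's note: a minimal illustrative sketch of what the relaxed traversal in this patch enables, mirroring the new test cases above. It assumes a yt-dlp checkout that already includes this change; `traverse_obj` is imported from `yt_dlp.utils`.]

    from yt_dlp.utils import traverse_obj

    # `...` now branches over any iterable (e.g. a range), not just `Sequence`s
    assert traverse_obj(range(4), ...) == [0, 1, 2, 3]

    # Function keys likewise filter (index, value) pairs of arbitrary iterables
    assert traverse_obj(range(4), lambda _, x: x % 2 == 0) == [0, 2]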
""" - is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes)) casefold = lambda k: k.casefold() if isinstance(k, str) else k if isinstance(expected_type, type): @@ -5564,7 +5563,7 @@ def apply_key(key, obj, is_last): branching = True if isinstance(obj, collections.abc.Mapping): result = obj.values() - elif is_sequence(obj): + elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)): result = obj elif isinstance(obj, re.Match): result = obj.groups() @@ -5578,7 +5577,7 @@ def apply_key(key, obj, is_last): branching = True if isinstance(obj, collections.abc.Mapping): iter_obj = obj.items() - elif is_sequence(obj): + elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)): iter_obj = enumerate(obj) elif isinstance(obj, re.Match): iter_obj = itertools.chain( @@ -5614,7 +5613,7 @@ def apply_key(key, obj, is_last): result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) elif isinstance(key, (int, slice)): - if is_sequence(obj): + if isinstance(obj, collections.abc.Sequence) and not isinstance(obj, (str, bytes)): branching = isinstance(key, slice) with contextlib.suppress(IndexError): result = obj[key] From 9b30cd3dfce83c2f0201b28a7a3ef44ab9722664 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 24 Apr 2023 13:16:22 -0600 Subject: [PATCH 061/501] [extractors/rtvc] Add extractors (#6578) * Add `RTVCPlay` extractor * Add `RTVCPlayEmbed` extractor * Add `RTVCKaltura` extractor * Add `SenalColombiaLive` extractor Closes #6457 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 6 + yt_dlp/extractor/rtvcplay.py | 285 ++++++++++++++++++++++++++++++ yt_dlp/extractor/senalcolombia.py | 31 ++++ 3 files changed, 322 insertions(+) create mode 100644 yt_dlp/extractor/rtvcplay.py create mode 100644 yt_dlp/extractor/senalcolombia.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 750708d77e..b82f52bca3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1621,6 +1621,11 @@ from .rtp import RTPIE from .rtrfm import RTRFMIE from .rts import RTSIE +from .rtvcplay import ( + RTVCPlayIE, + RTVCPlayEmbedIE, + RTVCKalturaIE, +) from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, @@ -1690,6 +1695,7 @@ ) from .scrolller import ScrolllerIE from .seeker import SeekerIE +from .senalcolombia import SenalColombiaLiveIE from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE diff --git a/yt_dlp/extractor/rtvcplay.py b/yt_dlp/extractor/rtvcplay.py new file mode 100644 index 0000000000..741c472621 --- /dev/null +++ b/yt_dlp/extractor/rtvcplay.py @@ -0,0 +1,285 @@ +import re + +from .common import InfoExtractor, ExtractorError +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + float_or_none, + js_to_json, + mimetype2ext, + traverse_obj, + urljoin, + url_or_none, +) + + +class RTVCPlayBaseIE(InfoExtractor): + _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co' + + def _extract_player_config(self, webpage, video_id): + return self._search_json( + r']*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage), + 'player_config', video_id, transform_source=js_to_json) + + def _extract_formats_and_subtitles_player_config(self, player_config, video_id): + formats, subtitles = [], {} + for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))): + ext = 
mimetype2ext(source.get('mimetype'), default=determine_ext(source['url'])) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + source['url'], video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': source['url'], + 'ext': ext, + }) + + return formats, subtitles + + +class RTVCPlayIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional', + 'info_dict': { + 'id': 'canal-institucional', + 'title': r're:^Canal Institucional', + 'description': 'md5:eff9e548394175928059320c006031ea', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia', + 'info_dict': { + 'id': 'senal-colombia', + 'title': r're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional', + 'info_dict': { + 'id': 'radio-nacional', + 'title': r're:^Radio Nacional', + 'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas', + 'md5': '1288ee6f6d1330d880f98bff2ed710a3', + 'info_dict': { + 'id': 'senoritas', + 'title': 'Señoritas', + 'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022', + 'md5': 'f040a7380a269ad633cf837384d5e9fc', + 'info_dict': { + 'id': 'james-regresa-clases-28022022', + 'title': 'James regresa a clases - 28/02/2022', + 'description': 'md5:c5dcdf757c7ab29305e8763c6007e675', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo', + 'info_dict': { + 'id': 'llinas-el-cerebro-y-el-universo', + 'title': 'Llinás, el cerebro y el universo', + 'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa', + 'info_dict': { + 'id': 'profe-en-tu-casa', + 'title': 'Profe en tu casa', + 'description': 'md5:47dbe20e263194413b1db2a2805a4f2e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 537, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'info_dict': { + 'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'title': 'Relato de un náufrago: una travesía del periodismo a la literatura', + 'description': 'md5:6da28fdca4a5a568ea47ef65ef775603', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones', + 'info_dict': { + 'id': 'diez-versiones', + 'title': 'Diez versiones', + 'description': 'md5:997471ed971cb3fd8e41969457675306', +
'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 20, + }] + + def _real_extract(self, url): + video_id, category = self._match_valid_url(url).group('id', 'category') + webpage = self._download_webpage(url, video_id) + + hydration = self._search_json( + r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration', + video_id, transform_source=js_to_json)['content']['currentContent'] + + asset_id = traverse_obj(hydration, ('video', 'assetid')) + if asset_id: + hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id) + else: + hls_url = traverse_obj(hydration, ('channel', 'hls')) + + metadata = traverse_obj(hydration, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'), + }, get_all=False) + + # Probably it's a program's page + if not hls_url: + seasons = traverse_obj( + hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'), + get_all=False) + if not seasons: + podcast_episodes = hydration.get('audios') + if not podcast_episodes: + raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes') + + return self.playlist_result([ + self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'episode_number': ('chapter_number', {float_or_none}, {int_or_none}), + 'season_number': ('season', {int_or_none}), + })) for episode in podcast_episodes], video_id, **metadata) + + entries = [self.url_result( + urljoin(url, episode['slug']), url_transparent=True, + **traverse_obj(season, { + 'season': 'title', + 'season_number': ('season', {int_or_none}), + }), **traverse_obj(episode, { + 'title': 'title', + 'thumbnail': ('image', 'cover', 'path'), + 'episode_number': ('chapter_number', {int_or_none}), + })) for season in seasons for episode in traverse_obj(season, ('contents', ...))] + + return self.playlist_result(entries, video_id, **metadata) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': category == 'en-vivo', + **metadata, + } + + +class RTVCPlayEmbedIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9', + 'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8', + 'info_dict': { + 'id': '72b0e699-248b-4929-a4a8-3782702fa7f9', + 'title': 'Tráiler: Señoritas', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid')) + metadata = {} if not asset_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('image', ..., 'thumbnail', 'path'), + }, get_all=False) + } + + +class RTVCKalturaIE(RTVCPlayBaseIE): + _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url':
'https://media.rtvc.gov.co/kalturartvc/indexSC.html', + 'info_dict': { + 'id': 'indexSC', + 'title': r're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId')) + metadata = {} if not channel_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('channel', 'image', 'logo', 'path'), + }) + } diff --git a/yt_dlp/extractor/senalcolombia.py b/yt_dlp/extractor/senalcolombia.py new file mode 100644 index 0000000000..f3c066da77 --- /dev/null +++ b/yt_dlp/extractor/senalcolombia.py @@ -0,0 +1,31 @@ +from .common import InfoExtractor +from .rtvcplay import RTVCKalturaIE + + +class SenalColombiaLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)' + + _TESTS = [{ + 'url': 'https://www.senalcolombia.tv/senal-en-vivo', + 'info_dict': { + 'id': 'indexSC', + 'title': 're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + hydration = self._search_json( + r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>', + webpage, 'hydration', display_id) + + return self.url_result(hydration['envivosrc'], RTVCKalturaIE, display_id) From c86e433c35fe5da6cb29f3539eef97497f84ed38 Mon Sep 17 00:00:00 2001 From: sqrtNOT <77981959+sqrtNOT@users.noreply.github.com> Date: Tue, 25 Apr 2023 10:21:06 +0000 Subject: [PATCH 062/501] [extractor/NiconicoSeries] Fix extraction (#6898) Authored by: sqrtNOT --- yt_dlp/extractor/niconico.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index cacefeb429..30b4d7216f 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -660,10 +660,10 @@ def _real_extract(self, url): class NiconicoSeriesIE(InfoExtractor): IE_NAME = 'niconico:series' - _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp(?:/user/\d+)?|nico\.ms)/series/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.nicovideo.jp/series/110226', + 'url': 'https://www.nicovideo.jp/user/44113208/series/110226', 'info_dict': { 'id': '110226', 'title': 'ご立派ァ!のシリーズ', @@ -683,7 +683,7 @@ class NiconicoSeriesIE(InfoExtractor): def _real_extract(self, url): list_id = self._match_id(url) - webpage =
self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id) + webpage = self._download_webpage(url, list_id) title = self._search_regex( (r'「(.+)(全', @@ -691,10 +691,9 @@ def _real_extract(self, url): webpage, 'title', fatal=False) if title: title = unescapeHTML(title) - playlist = [ - self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) - for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)] - return self.playlist_result(playlist, list_id, title) + json_data = next(self._yield_json_ld(webpage, None, fatal=False)) + return self.playlist_from_matches( + traverse_obj(json_data, ('itemListElement', ..., 'url')), list_id, title, ie=NiconicoIE) class NiconicoHistoryIE(NiconicoPlaylistBaseIE): From 0c4e0fbcade0fc92d14c2a6d63e360fe067f6192 Mon Sep 17 00:00:00 2001 From: Neurognostic <donovan@tremura.email> Date: Tue, 25 Apr 2023 12:13:54 -0400 Subject: [PATCH 063/501] [extractor/bitchute] Add more fallback subdomains (#6907) Authored by: Neurognostic --- yt_dlp/extractor/bitchute.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 10e7b0b2bb..a6779505e5 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor): def _check_format(self, video_url, video_id): urls = orderedSet( re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) - for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128', + 'seed132', 'seed150', 'seed151', 'seed152', 'seed153', + 'seed167', 'seed171', 'seed177', 'seed305', 'seed307', + 'seedp29xb', 'zb10-7gsop1v78')) for url in urls: try: response = self._request_webpage( From 62beefa818c75c20b6941389bb197051554a5d41 Mon Sep 17 00:00:00 2001 From: Noah <nkempers@outlook.de> Date: Tue, 25 Apr 2023 22:46:14 +0200 Subject: [PATCH 064/501] [extractor/pornhub] Set access cookies to fix extraction (#6685) Closes #4299 Authored by: Schmoaaaaah, arobase-che Co-authored-by: Noah <nkempers@outlook.de> Co-authored-by: ache <ache@ache.one> --- yt_dlp/extractor/pornhub.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 5d8d7c100a..2f5a572a5b 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -58,6 +58,11 @@ def dl(*args, **kwargs): def _real_initialize(self): self._logged_in = False + def _set_age_cookies(self, host): + self._set_cookie(host, 'age_verified', '1') + self._set_cookie(host, 'accessAgeDisclaimerPH', '1') + self._set_cookie(host, 'accessPH', '1') + def _login(self, host): if self._logged_in: return @@ -267,8 +272,7 @@ def _real_extract(self, url): video_id = mobj.group('id') self._login(host) - - self._set_cookie(host, 'age_verified', '1') + self._set_age_cookies(host) def dl_webpage(platform): self._set_cookie(host, 'platform', platform) @@ -569,6 +573,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) user_id = mobj.group('id') videos_url = '%s/videos' % mobj.group('url') + self._set_age_cookies(mobj.group('host')) page = self._extract_page(url) if page: videos_url = update_url_query(videos_url, {'page': page}) @@ -633,6 +638,7 @@ def _real_extract(self, url): item_id = mobj.group('id') self._login(host) + self._set_age_cookies(host) return self.playlist_result(self._entries(url, host, item_id), 
item_id) @@ -812,5 +818,6 @@ def _real_extract(self, url): item_id = mobj.group('id') self._login(host) + self._set_age_cookies(host) return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id) From ed81b74802b4247ee8d9dc0ef87eb52baefede1c Mon Sep 17 00:00:00 2001 From: Alex Klapheke <alexklapheke@gmail.com> Date: Wed, 26 Apr 2023 02:53:07 -0400 Subject: [PATCH 065/501] [extractor/aeonco] Support Youtube embeds (#6591) Authored by: alexklapheke --- yt_dlp/extractor/aeonco.py | 52 +++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/aeonco.py b/yt_dlp/extractor/aeonco.py index 4655862e3f..390eae32bf 100644 --- a/yt_dlp/extractor/aeonco.py +++ b/yt_dlp/extractor/aeonco.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from .vimeo import VimeoIE +from ..utils import ExtractorError, traverse_obj, url_or_none class AeonCoIE(InfoExtractor): @@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor): } }, { 'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', - 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'md5': '03582d795382e49f2fd0b427b55de409', 'info_dict': { - 'id': '728595228', + 'id': '759576926', 'ext': 'mp4', 'title': 'Wrought', - 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', - 'uploader': 'Biofilm Productions', - 'uploader_id': 'user140352216', - 'uploader_url': 'https://vimeo.com/user140352216', + 'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280', + 'uploader': 'Aeon Video', + 'uploader_id': 'aeonvideo', + 'uploader_url': 'https://vimeo.com/aeonvideo', 'duration': 1344 } + }, { + 'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out', + 'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b', + 'info_dict': { + 'id': 'emyi4z-O0ls', + 'ext': 'mp4', + 'title': 'How to outsmart the Prisoner’s Dilemma - Lucas Husted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp', + 'uploader': 'TED-Ed', + 'uploader_id': '@TEDEd', + 'uploader_url': 'https://www.youtube.com/@TEDEd', + 'duration': 344, + 'upload_date': '20200827', + 'channel_id': 'UCsooa4yRKGN_zEE8iknghZA', + 'playable_in_embed': True, + 'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3', + 'categories': ['Education'], + 'like_count': int, + 'channel': 'TED-Ed', + 'chapters': 'count:7', + 'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA', + 'tags': 'count:26', + 'availability': 'public', + 'channel_follower_count': int, + 'view_count': int, + 'age_limit': 0, + 'live_status': 'not_live', + 'comment_count': int, + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id') - vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') - return self.url_result(vimeo_url, VimeoIE) + embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), ( + lambda _, v: v['@type'] == 'VideoObject', 'embedUrl', {url_or_none}), get_all=False) + if not embed_url: + raise ExtractorError('No embed URL found in webpage') + if 'player.vimeo.com' in embed_url: + embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/') + return self.url_result(embed_url) From 
30647668a92a0ca5cd108776804baac0996bd9f7 Mon Sep 17 00:00:00 2001 From: garret <76261416+garret1317@users.noreply.github.com> Date: Thu, 27 Apr 2023 00:42:07 +0100 Subject: [PATCH 066/501] [extractor/globalplayer] Add extractors (#6903) Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/globalplayer.py | 254 +++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100755 yt_dlp/extractor/globalplayer.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b82f52bca3..3b5ae63b1f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -685,6 +685,13 @@ from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE +from .globalplayer import ( + GlobalPlayerLiveIE, + GlobalPlayerLivePlaylistIE, + GlobalPlayerAudioIE, + GlobalPlayerAudioEpisodeIE, + GlobalPlayerVideoIE +) from .globo import ( GloboIE, GloboArticleIE, diff --git a/yt_dlp/extractor/globalplayer.py b/yt_dlp/extractor/globalplayer.py new file mode 100755 index 0000000000..e0c0d58fd4 --- /dev/null +++ b/yt_dlp/extractor/globalplayer.py @@ -0,0 +1,254 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + join_nonempty, + parse_duration, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + urlhandle_detect_ext, +) + + +class GlobalPlayerBaseIE(InfoExtractor): + def _get_page_props(self, url, video_id): + webpage = self._download_webpage(url, video_id) + return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + + def _request_ext(self, url, video_id): + return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests + url, video_id, note='Determining source extension')) + + def _extract_audio(self, episode, series): + return { + 'vcodec': 'none', + **traverse_obj(series, { + 'series': 'title', + 'series_id': 'id', + 'thumbnail': 'imageUrl', + 'uploader': 'itunesAuthor', # podcasts only + }), + **traverse_obj(episode, { + 'id': 'id', + 'description': ('description', {clean_html}), + 'duration': ('duration', {parse_duration}), + 'thumbnail': 'imageUrl', + 'url': 'streamUrl', + 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}), + 'title': 'title', + }, get_all=False) + } + + +class GlobalPlayerLiveIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/live/smoothchill/uk/', + 'info_dict': { + 'id': '2mx1E', + 'ext': 'aac', + 'display_id': 'smoothchill-uk', + 'title': 're:^Smooth Chill.+$', + 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png', + 'description': 'Music To Chill To', + 'live_status': 'is_live', + }, + }, { + # national station + 'url': 'https://www.globalplayer.com/live/heart/uk/', + 'info_dict': { + 'id': '2mwx4', + 'ext': 'aac', + 'description': 'turn up the feel good!', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'live_status': 'is_live', + 'title': 're:^Heart UK.+$', + 'display_id': 'heart-uk', + }, + }, { + # regional variation + 'url': 'https://www.globalplayer.com/live/heart/london/', + 'info_dict': { + 'id': 'AMqg', + 'ext': 'aac', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'title': 're:^Heart London.+$', + 'live_status': 'is_live', + 'display_id': 'heart-london', + 'description': 'turn up the feel good!', + }, + }] + + def _real_extract(self, url): + 
video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['station'] + stream_url = station['streamUrl'] + + return { + 'id': station['id'], + 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'), + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': (('name', 'brandName'), {str_or_none}), + 'description': 'tagline', + 'thumbnail': 'brandLogo', + }, get_all=False), + } + + +class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)' + _TESTS = [{ + # "live playlist" + 'url': 'https://www.globalplayer.com/playlists/8bLk/', + 'info_dict': { + 'id': '8bLk', + 'ext': 'aac', + 'live_status': 'is_live', + 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', + 'title': 're:^Classic FM Hall of Fame.+$' + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['playlistData'] + stream_url = station['streamUrl'] + + return { + 'id': video_id, + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': 'title', + 'description': 'description', + 'thumbnail': 'image', + }), + } + + +class GlobalPlayerAudioIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/42KuaM/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '42KuaM', + 'title': 'Filthy Ritual', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'categories': ['Society & Culture', 'True Crime'], + 'uploader': 'Global', + 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + series = props['podcastInfo'] if podcast else props['catchupInfo'] + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': [self._extract_audio(ep, series) for ep in traverse_obj( + series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], + 'categories': traverse_obj(series, ('categories', ..., 'name')) or None, + **traverse_obj(series, { + 'description': 'description', + 'thumbnail': 'imageUrl', + 'title': 'title', + 'uploader': 'itunesAuthor', # podcasts only + }), + } + + +class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/', + 'info_dict': { + 'id': '7DrfNnE', + 'ext': 'mp3', + 'title': 'Filthy Ritual - Trailer', + 'description': 'md5:1f1562fd0f01b4773b590984f94223e0', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'duration': 225.0, + 'timestamp': 1681254900, + 'series': 'Filthy Ritual', 
+ 'series_id': '42KuaM', + 'upload_date': '20230411', + 'uploader': 'Global', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/', + 'info_dict': { + 'id': '2zGq26Vcv1fCWhddC4JAwETXWe', + 'ext': 'm4a', + 'timestamp': 1682056800, + 'series': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + 'upload_date': '20230421', + 'series_id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'duration': 10800.0, + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + episode = props['podcastEpisode'] if podcast else props['catchupEpisode'] + + return self._extract_audio( + episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}) + + +class GlobalPlayerVideoIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/', + 'info_dict': { + 'id': '2JsSZ7Gm2uP', + 'ext': 'mp4', + 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd', + 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550', + 'upload_date': '20230420', + 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._get_page_props(url, video_id)['videoData'] + + return { + 'id': video_id, + **traverse_obj(meta, { + 'url': 'url', + 'thumbnail': ('image', 'url'), + 'title': 'title', + 'upload_date': ('publish_date', {unified_strdate}), + 'description': 'description', + }), + } From 170605840ea9d5ad75da6576485ea7d125b428ee Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 27 Apr 2023 05:52:22 +0530 Subject: [PATCH 067/501] Populate `filename` and `urls` fields at all stages of `--print` Closes https://github.com/yt-dlp/yt-dlp/issues/6920 --- yt_dlp/YoutubeDL.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index dce6cf928c..482b1a49e9 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1677,7 +1677,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) self._fill_common_fields(info_copy, False) - self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self.__forced_printings(info_copy) self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) @@ -2719,7 +2719,7 @@ def is_wellformed(f): self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work - self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) + self.__forced_printings(info_dict) return info_dict format_selector = self.format_selector @@ -2879,6 +2879,12 @@ def _forceprint(self, key, info_dict): if info_dict is None: return info_copy = info_dict.copy() + info_copy.setdefault('filename', self.prepare_filename(info_dict)) + if info_dict.get('requested_formats') is not None: + # For RTMP URLs, also include the playpath + info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) + elif info_dict.get('url'): + info_copy['urls'] = 
info_dict['url'] + info_dict.get('play_path', '') info_copy['formats_table'] = self.render_formats_table(info_dict) info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict) info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles')) @@ -2907,7 +2913,9 @@ def format_tmpl(tmpl): with open(filename, 'a', encoding='utf-8', newline='') as f: f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep) - def __forced_printings(self, info_dict, filename, incomplete): + return info_copy + + def __forced_printings(self, info_dict, filename=None, incomplete=True): def print_mandatory(field, actual_field=None): if actual_field is None: actual_field = field @@ -2920,20 +2928,14 @@ def print_optional(field): and info_dict.get(field) is not None): self.to_stdout(info_dict[field]) - info_dict = info_dict.copy() - if filename is not None: - info_dict['filename'] = filename - if info_dict.get('requested_formats') is not None: - # For RTMP URLs, also include the playpath - info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats']) - elif info_dict.get('url'): - info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '') - if (self.params.get('forcejson') or self.params['forceprint'].get('video') or self.params['print_to_file'].get('video')): self.post_extract(info_dict) - self._forceprint('video', info_dict) + + if filename: + info_dict['filename'] = filename + info_dict = self._forceprint('video', info_dict) print_mandatory('title') print_mandatory('id') @@ -3493,10 +3495,10 @@ def run_pp(self, pp, infodict): return infodict def run_all_pps(self, key, info, *, additional_pps=None): - if key != 'video': - self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: info = self.run_pp(pp, info) + if key != 'video': + self._forceprint(key, info) return info def pre_process(self, ie_info, key='pre_process', files_to_move=None): From 7cf51f21916292cd80bdeceb37489f5322f166dd Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 27 Apr 2023 07:42:17 +0530 Subject: [PATCH 068/501] [jsinterp] Handle negative numbers better Closes #6131 --- test/test_jsinterp.py | 16 ++++++++++++++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 8 +++++--- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index e090dc7914..3283657d70 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -445,6 +445,22 @@ def test_bitwise_operators_overflow(self): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_negative(self): + jsi = JSInterpreter("function f(){return 2 * -2.0;}") + self.assertEqual(jsi.call_function('f'), -4) + + jsi = JSInterpreter('function f(){return 2 - - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + + jsi = JSInterpreter('function f(){return 2 - - - -2;}') + self.assertEqual(jsi.call_function('f'), 4) + + jsi = JSInterpreter('function f(){return 2 - + + - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + + jsi = JSInterpreter('function f(){return 2 + - + - -2;}') + self.assertEqual(jsi.call_function('f'), 0) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 336e80291f..e2b3f0870d 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -142,6 +142,10 @@ 
'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', ), + ( + 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', + 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index db65260091..5571ecfeb1 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -243,7 +243,7 @@ def _separate(expr, delim=',', max_split=None): return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 - in_quote, escaping, after_op, in_regex_char_group = None, False, True, False + in_quote, escaping, after_op, in_regex_char_group, in_unary_op = None, False, True, False, False for idx, char in enumerate(expr): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 @@ -258,9 +258,11 @@ def _separate(expr, delim=',', max_split=None): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op) + in_unary_op = (not in_quote and not in_regex_char_group + and after_op not in (True, False) and char in '-+') + after_op = char if (not in_quote and char in OP_CHARS) else (char.isspace() and after_op) - if char != delim[pos] or any(counters.values()) or in_quote: + if char != delim[pos] or any(counters.values()) or in_quote or in_unary_op: pos = 0 continue elif pos != delim_len: From b5f61b69d4561b81fc98c226b176f0c15493e688 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 27 Apr 2023 19:35:28 +0530 Subject: [PATCH 069/501] Fix bug in 170605840ea9d5ad75da6576485ea7d125b428ee and related refactor --- yt_dlp/YoutubeDL.py | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 482b1a49e9..a8b4a650e7 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2916,36 +2916,30 @@ def format_tmpl(tmpl): return info_copy def __forced_printings(self, info_dict, filename=None, incomplete=True): - def print_mandatory(field, actual_field=None): - if actual_field is None: - actual_field = field - if (self.params.get('force%s' % field, False) - and (not incomplete or info_dict.get(actual_field) is not None)): - self.to_stdout(info_dict[actual_field]) - - def print_optional(field): - if (self.params.get('force%s' % field, False) - and info_dict.get(field) is not None): - self.to_stdout(info_dict[field]) - if (self.params.get('forcejson') or self.params['forceprint'].get('video') or self.params['print_to_file'].get('video')): self.post_extract(info_dict) - if filename: info_dict['filename'] = filename - info_dict = self._forceprint('video', info_dict) + info_copy = self._forceprint('video', info_dict) - print_mandatory('title') - print_mandatory('id') - print_mandatory('url', 'urls') - print_optional('thumbnail') - print_optional('description') - print_optional('filename') - if self.params.get('forceduration') and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - print_mandatory('format') + def print_field(field, actual_field=None, optional=False): + if actual_field is None: + actual_field = field + if self.params.get(f'force{field}') and ( + info_copy.get(field) is not None or (not optional and not incomplete)): + self.to_stdout(info_copy[actual_field]) + + print_field('title') + print_field('id') + 
print_field('url', 'urls') + print_field('thumbnail', optional=True) + print_field('description', optional=True) + print_field('filename', optional=True) + if self.params.get('forceduration') and info_copy.get('duration') is not None: + self.to_stdout(formatSeconds(info_copy['duration'])) + print_field('format') if self.params.get('forcejson'): self.to_stdout(json.dumps(self.sanitize_info(info_dict))) From 7a7b1376fbce0067cf37566bb47131bc0022638d Mon Sep 17 00:00:00 2001 From: makeworld <25111343+makew0rld@users.noreply.github.com> Date: Thu, 27 Apr 2023 22:42:25 -0400 Subject: [PATCH 070/501] [extractor/cbc] Fix live extractor, playlist `_VALID_URL` (#6625) Authored by: makew0rld --- yt_dlp/extractor/cbc.py | 120 +++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index eadb3f8c02..e42f062464 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -8,14 +8,16 @@ compat_str, ) from ..utils import ( + ExtractorError, int_or_none, join_nonempty, js_to_json, orderedSet, + parse_iso8601, smuggle_url, strip_or_none, + traverse_obj, try_get, - ExtractorError, ) @@ -404,7 +406,7 @@ def _real_extract(self, url): class CBCGemPlaylistIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:playlist' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' _TESTS = [{ # TV show playlist, all public videos 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', @@ -414,6 +416,9 @@ class CBCGemPlaylistIE(InfoExtractor): 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', }, + }, { + 'url': 'https://gem.cbc.ca/schitts-creek/s06', + 'only_matching': True, }] _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' @@ -473,49 +478,90 @@ def _real_extract(self, url): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' - _TEST = { - 'url': 'https://gem.cbc.ca/live/920604739687', - 'info_dict': { - 'title': 'Ottawa', - 'description': 'The live TV channel and local programming from Ottawa', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, - 'id': 'AyqZwxRqh8EH', - 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', - 'uploader': 'CBCC-NEW', + _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', }, - 'skip': 'Live might have ended', - } - - # It's unclear where the chars at the end come from, but they appear to be - # constant. Might need updating in the future. - # There are two URLs, some livestreams are in one, and some - # in the other. The JSON schema is the same for both. 
- _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + { + 'url': 'https://gem.cbc.ca/live/44', + 'info_dict': { + 'id': '44', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^Ottawa [0-9\-: ]+', + 'description': 'The live TV channel and local programming from Ottawa', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*' + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { + 'url': 'https://gem.cbc.ca/live-event/10835', + 'info_dict': { + 'id': '10835', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+', + 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + 'timestamp': 1679706000, + 'upload_date': '20230325', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + } + ] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - for api_url in self._API_URLS: - video_info = next(( - stream for stream in self._download_json(api_url, video_id)['entries'] - if stream.get('guid') == video_id), None) - if video_info: - break - else: + # Two types of metadata JSON + if not video_info.get('formattedIdMedia'): + video_info = traverse_obj( + video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}), + get_all=False, default={}) + + video_stream_id = video_info.get('formattedIdMedia') + if not video_stream_id: raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + stream_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'mpx', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestType': 'desktop', + }) + return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': video_info['content'][0]['url'], 'id': video_id, - 'title': video_info.get('title'), - 'description': video_info.get('description'), - 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), - 'thumbnail': video_info.get('cbc$staticImage'), + 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True), 'is_live': True, + **traverse_obj(video_info, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('images', 'card', 'url'), + 'timestamp': ('airDate', {parse_iso8601}), + }) } From f005a35aa7e4f67a0c603a946c0dd714c151b2d6 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 29 Apr 2023 00:58:48 +0530 Subject: [PATCH 071/501] Ensure pre-processor errors do not block `--print` Closes #6937 --- yt_dlp/YoutubeDL.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a8b4a650e7..857b7ea374 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3489,10 +3489,12 @@ def run_pp(self, pp, infodict): return infodict def run_all_pps(self, key, info, *, additional_pps=None): - for pp in (additional_pps or []) + self._pps[key]: - info = 
self.run_pp(pp, info) - if key != 'video': - self._forceprint(key, info) + try: + for pp in (additional_pps or []) + self._pps[key]: + info = self.run_pp(pp, info) + finally: + if key != 'video': + self._forceprint(key, info) return info def pre_process(self, ie_info, key='pre_process', files_to_move=None): From 17ba4343cf99701692a7f4798fd42b50f644faba Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 29 Apr 2023 02:57:50 +0530 Subject: [PATCH 072/501] Fix f005a35aa7e4f67a0c603a946c0dd714c151b2d6 Printing inside `finally` causes the order of logging to change when there is an error, which is undesirable. So this is reverted. The issue of `--print` being blocked by pre-processors was an unintentional side-effect of changing the operation orders in 170605840ea9d5ad75da6576485ea7d125b428ee, and this is also partially reverted. --- yt_dlp/YoutubeDL.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 857b7ea374..8ee42b86a6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3488,13 +3488,11 @@ def run_pp(self, pp, infodict): *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict - def run_all_pps(self, key, info, *, additional_pps=None): - try: - for pp in (additional_pps or []) + self._pps[key]: - info = self.run_pp(pp, info) - finally: - if key != 'video': - self._forceprint(key, info) + def run_all_pps(self, key, info, *, additional_pps=None, fatal=True): + if key != 'video': + self._forceprint(key, info) + for pp in (additional_pps or []) + self._pps[key]: + info = self.run_pp(pp, info) return info def pre_process(self, ie_info, key='pre_process', files_to_move=None): From 4d9280c9c853733534dda60486fa949bcca36c9e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Apr 2023 13:19:35 -0500 Subject: [PATCH 073/501] [extractor/reddit] Add login support (#6950) Closes #6949 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 3e458456c1..13615e82f9 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -8,11 +8,13 @@ traverse_obj, try_get, unescapeHTML, + urlencode_postdata, url_or_none, ) class RedditIE(InfoExtractor): + _NETRC_MACHINE = 'reddit' _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', @@ -176,6 +178,25 @@ class RedditIE(InfoExtractor): 'only_matching': True, }] + def _perform_login(self, username, password): + captcha = self._download_json( + 'https://www.reddit.com/api/requires_captcha/login.json', None, + 'Checking login requirement')['required'] + if captcha: + raise ExtractorError('Reddit is requiring captcha before login', expected=True) + login = self._download_json( + f'https://www.reddit.com/api/login/{username}', None, data=urlencode_postdata({ + 'op': 'login-main', + 'user': username, + 'passwd': password, + 'api_type': 'json', + }), note='Logging in', errnote='Login request failed') + errors = '; '.join(traverse_obj(login, ('json', 'errors', ..., 1))) + if errors: + raise ExtractorError(f'Unable to login, Reddit API says {errors}', expected=True) + elif not traverse_obj(login, ('json', 'data', 'cookie', {str})): + raise ExtractorError('Unable to 
login, no cookie was returned') + def _real_extract(self, url): host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') From b079c26f0af8085bccdadc72c61c8164ca5ab0f8 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 30 Apr 2023 19:50:22 +0200 Subject: [PATCH 074/501] [utils] `traverse_obj`: More fixes (#6959) - Fix result when branching with `traverse_string` - Fix `slice` path on `dict`s - Fix tests and docstrings from 21b5ec86c2c37d10c5bb97edd7051d3aac16bb3e - Add `is_iterable_like` helper function Authored by: Grub4K --- test/test_utils.py | 21 +++++++++++++++++++-- yt_dlp/utils.py | 28 ++++++++++++++++++---------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index f2f3b8170a..e1bf6ac20f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2016,7 +2016,7 @@ def test_traverse_obj(self): msg='nested `...` queries should work') self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4), msg='`...` query result should be flattened') - self.assertEqual(traverse_obj(range(4), ...), list(range(4)), + self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)), msg='`...` should accept iterables') # Test function as key @@ -2025,7 +2025,7 @@ def test_traverse_obj(self): msg='function as query key should perform a filter based on (key, value)') self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, msg='exceptions in the query function should be catched') - self.assertEqual(traverse_obj(range(4), lambda _, x: x % 2 == 0), [0, 2], + self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2], msg='function key should accept iterables') if __debug__: with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): @@ -2051,6 +2051,17 @@ def test_traverse_obj(self): with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): traverse_obj(_TEST_DATA, {str.upper, str}) + # Test `slice` as a key + _SLICE_DATA = [0, 1, 2, 3, 4] + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None, + msg='slice on a dictionary should not throw') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2], + msg='slice key should apply slice to sequence') + self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2], + msg='slice key should apply slice to sequence') + # Test alternative paths self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', msg='multiple `paths` should be treated as alternative paths') @@ -2234,6 +2245,12 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), traverse_string=True), ['s', 'r'], msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') + self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [], + msg='branching should result in list if `traverse_string`') # Test is_user_input behavior _IS_USER_INPUT_DATA = {'range8': list(range(8))} diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f69311462d..2f5e667204 100644 
--- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3273,8 +3273,14 @@ def multipart_encode(data, boundary=None): return out, content_type -def variadic(x, allowed_types=(str, bytes, dict)): - return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) +def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT): + if blocked_types is NO_DEFAULT: + blocked_types = (str, bytes, collections.abc.Mapping) + return isinstance(x, allowed_types) and not isinstance(x, blocked_types) + + +def variadic(x, allowed_types=NO_DEFAULT): + return x if is_iterable_like(x, blocked_types=allowed_types) else (x,) def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -5467,7 +5473,7 @@ def traverse_obj( obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, casesense=True, is_user_input=False, traverse_string=False): """ - Safely traverse nested `dict`s and `Sequence`s + Safely traverse nested `dict`s and `Iterable`s >>> obj = [{}, {"key": "value"}] >>> traverse_obj(obj, (1, "key")) @@ -5475,7 +5481,7 @@ def traverse_obj( Each of the provided `paths` is tested and the first producing a valid result will be returned. The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. + Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. @@ -5492,7 +5498,7 @@ def traverse_obj( Read as: `[traverse_obj(obj, branch) for branch in branches]`. - `function`: Branch out and return values filtered by the function. Read as: `[value for key, value in obj if function(key, value)]`. - For `Sequence`s, `key` is the index of the value. + For `Iterable`s, `key` is the index of the value. For `re.Match`es, `key` is the group number (0 = full match) as well as additionally any group names, if given. - `dict` Transform the current object and return a matching dict. @@ -5540,7 +5546,9 @@ def apply_key(key, obj, is_last): result = None if obj is None and traverse_string: - pass + if key is ... 
or callable(key) or isinstance(key, slice): + branching = True + result = () elif key is None: result = obj @@ -5563,7 +5571,7 @@ def apply_key(key, obj, is_last): branching = True if isinstance(obj, collections.abc.Mapping): result = obj.values() - elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)): + elif is_iterable_like(obj): result = obj elif isinstance(obj, re.Match): result = obj.groups() @@ -5577,7 +5585,7 @@ def apply_key(key, obj, is_last): branching = True if isinstance(obj, collections.abc.Mapping): iter_obj = obj.items() - elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)): + elif is_iterable_like(obj): iter_obj = enumerate(obj) elif isinstance(obj, re.Match): iter_obj = itertools.chain( @@ -5601,7 +5609,7 @@ def apply_key(key, obj, is_last): } or None elif isinstance(obj, collections.abc.Mapping): - result = (obj.get(key) if casesense or (key in obj) else + result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else next((v for k, v in obj.items() if casefold(k) == key), None)) elif isinstance(obj, re.Match): @@ -5613,7 +5621,7 @@ def apply_key(key, obj, is_last): result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) elif isinstance(key, (int, slice)): - if isinstance(obj, collections.abc.Sequence) and not isinstance(obj, (str, bytes)): + if is_iterable_like(obj, collections.abc.Sequence): branching = isinstance(key, slice) with contextlib.suppress(IndexError): result = obj[key] From 147e62fc584c3ea6fdb09bb7a47905df68553a22 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 1 May 2023 18:55:28 -0500 Subject: [PATCH 075/501] [extractor/twitter] Default to GraphQL, handle auth errors (#6957) Closes #6763 Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/twitter.py | 132 ++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 47da19011f..c1f34235db 100644 --- a/README.md +++ b/README.md @@ -1833,7 +1833,7 @@ #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` #### twitter -* `force_graphql`: Force usage of the GraphQL API. By default it will only be used if login cookies are provided +* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. 
Has no effect if login cookies are passed **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 3f1899e962..d9a89c44b6 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,6 +1,5 @@ import json import re -import urllib.error from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE @@ -17,6 +16,7 @@ format_field, int_or_none, make_archive_id, + remove_end, str_or_none, strip_or_none, traverse_obj, @@ -32,11 +32,9 @@ class TwitterBaseIE(InfoExtractor): _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - _TOKENS = { - 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None, - 'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None, - } _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' + _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} + _guest_token = None def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -94,7 +92,7 @@ def is_logged_in(self): def _call_api(self, path, video_id, query={}, graphql=False): cookies = self._get_cookies(self._API_BASE) - headers = {} + headers = self._AUTH.copy() csrf_cookie = cookies.get('ct0') if csrf_cookie: @@ -107,54 +105,34 @@ def _call_api(self, path, video_id, query={}, graphql=False): 'x-twitter-active-user': 'yes', }) - last_error = None - for bearer_token in self._TOKENS: - for first_attempt in (True, False): - headers['Authorization'] = f'Bearer {bearer_token}' + for first_attempt in (True, False): + if not self.is_logged_in and not self._guest_token: + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', video_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if self._guest_token: + headers['x-guest-token'] = self._guest_token + elif not self.is_logged_in: + raise ExtractorError('Could not retrieve guest token') - if not self.is_logged_in: - if not self._TOKENS[bearer_token]: - headers.pop('x-guest-token', None) - guest_token_response = self._download_json( - self._API_BASE + 'guest/activate.json', video_id, - 'Downloading guest token', data=b'', headers=headers) + allowed_status = {400, 401, 403, 404} if graphql else {403} + result = self._download_json( + (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, + video_id, headers=headers, query=query, expected_status=allowed_status, + note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') - self._TOKENS[bearer_token] = guest_token_response.get('guest_token') - if not self._TOKENS[bearer_token]: - raise ExtractorError('Could not retrieve guest token') + if result.get('errors'): + errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) + if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): + self.to_screen('Guest token has expired. 
Refreshing guest token') + self._guest_token = None + continue - headers['x-guest-token'] = self._TOKENS[bearer_token] + raise ExtractorError( + f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) - try: - allowed_status = {400, 403, 404} if graphql else {403} - result = self._download_json( - (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, - video_id, headers=headers, query=query, expected_status=allowed_status) - - except ExtractorError as e: - if last_error: - raise last_error - - if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404: - raise - - last_error = e - self.report_warning( - 'Twitter API gave 404 response, retrying with deprecated auth token. ' - 'Only one media item can be extracted') - break # continue outer loop with next bearer_token - - if result.get('errors'): - errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str) - if first_attempt and any('bad guest token' in error.lower() for error in errors): - self.to_screen('Guest token has expired. Refreshing guest token') - self._TOKENS[bearer_token] = None - continue - - error_message = ', '.join(set(errors)) or 'Unknown error' - raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True) - - return result + return result def _build_graphql_query(self, media_id): raise NotImplementedError('Method must be implemented to support GraphQL') @@ -313,6 +291,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 18, }, @@ -391,6 +370,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, }, @@ -431,6 +411,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -480,6 +461,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, }, @@ -505,6 +487,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -529,6 +512,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -589,6 +573,7 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -630,12 +615,12 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, }, }, { - # Adult content, uses old token - # Fails if not logged in (GraphQL) + # Adult content, fails if not logged in (GraphQL) 'url': 'https://twitter.com/Rizdraws/status/1575199173472927762', 'info_dict': { 'id': '1575199163847000068', @@ -655,9 +640,8 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 18, 'tags': [] }, - 'expected_warnings': ['404'], + 'skip': 'Requires authentication', }, { - # Description is missing one https://t.co url (GraphQL) 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -669,14 +653,13 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20210519', 'age_limit': 0, 'repost_count': int, - 'description': 'Here it is! Finished my gothic western cartoon. 
Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7', + 'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw', 'uploader_id': 'Srirachachau', 'comment_count': int, 'uploader_url': 'https://twitter.com/Srirachachau', 'timestamp': 1621447860, }, }, { - # Description is missing one https://t.co url (GraphQL) 'url': 'https://twitter.com/DavidToons_/status/1578353380363501568', 'playlist_mincount': 2, 'info_dict': { @@ -688,7 +671,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': str, 'timestamp': 1665143744, 'uploader_url': 'https://twitter.com/DavidToons_', - 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w', + 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/WgJauwIW1w', 'tags': [], 'comment_count': int, 'upload_date': '20221007', @@ -752,7 +735,7 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '1600649511827013632', 'ext': 'mp4', - 'title': 'md5:dac4f4d4c591fcc4e88a253eba472dc3', + 'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1', 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1670459604.0, 'uploader_id': 'CTVJLaidlaw', @@ -764,6 +747,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, + 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, @@ -791,6 +775,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'view_count': int, }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -806,6 +791,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'duration': 9.531, 'comment_count': int, + 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, @@ -815,7 +801,6 @@ class TwitterIE(TwitterBaseIE): }, 'params': {'noplaylist': True}, }, { - # Media view count is GraphQL only, force in test 'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625', 'info_dict': { 'id': '1600009362759733248', @@ -826,10 +811,10 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün The Shinobi | BlaqBoi\'s Therapist', + 'uploader': 'Mün The Shinobi', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün The Shinobi | BlaqBoi\'s Therapist - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Mün The Shinobi - This is a genius ad by Apple. 
\U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -837,9 +822,8 @@ class TwitterIE(TwitterBaseIE): 'duration': 139.987, 'timestamp': 1670306984.0, }, - 'params': {'extractor_args': {'twitter': {'force_graphql': ['']}}}, }, { - # url to retweet id + # url to retweet id, legacy API 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', @@ -860,6 +844,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'comment_count': int, }, + 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -905,11 +890,13 @@ def _graphql_to_legacy(self, data, twid): 'tweet_results', 'result', ('tweet', None), ), expected_type=dict, default={}, get_all=False) - if result.get('__typename') not in ('Tweet', None): + if result.get('__typename') not in ('Tweet', 'TweetTombstone', None): self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) if 'tombstone' in result: - cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str) + cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') + if cause and 'adult content' in cause: + self.raise_login_required(cause) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) status = result.get('legacy', {}) @@ -922,7 +909,7 @@ def _graphql_to_legacy(self, data, twid): # extra transformation is needed since result does not match legacy format binding_values = { binding_value.get('key'): binding_value.get('value') - for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict) + for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict})) } if binding_values: status['card']['binding_values'] = binding_values @@ -965,12 +952,7 @@ def _build_graphql_query(self, media_id): def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') - if self.is_logged_in or self._configuration_arg('force_graphql'): - self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})') - result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) - status = self._graphql_to_legacy(result, twid) - - else: + if self._configuration_arg('legacy_api') and not self.is_logged_in: status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, @@ -978,6 +960,9 @@ def _real_extract(self, url): 'include_user_entities': 0, 'tweet_mode': 'extended', }), 'retweeted_status', None) + else: + result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) + status = self._graphql_to_legacy(result, twid) title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames @@ -1142,7 +1127,8 @@ def get_binding_value(k): if not entries: expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none) if not expanded_url or expanded_url == url: - raise ExtractorError('No video could be found in this tweet', expected=True) + self.raise_no_formats('No video could be found in this tweet', expected=True) + return info return self.url_result(expanded_url, display_id=twid, **info) From b423b6a48e0b19260bc95ab7d72d2138d7f124dc Mon Sep 17 00:00:00 2001 From: Nicholas Defranco 
<39540565+nick-cd@users.noreply.github.com> Date: Mon, 1 May 2023 20:03:27 -0400 Subject: [PATCH 076/501] [extractor/dlf] Add extractors (#6697) Closes #6430 Authored by: nick-cd --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/dlf.py | 192 ++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 yt_dlp/extractor/dlf.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3b5ae63b1f..2d582f67f3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -452,6 +452,10 @@ ) from .democracynow import DemocracynowIE from .detik import DetikEmbedIE +from .dlf import ( + DLFIE, + DLFCorpusIE, +) from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE diff --git a/yt_dlp/extractor/dlf.py b/yt_dlp/extractor/dlf.py new file mode 100644 index 0000000000..88a4149b56 --- /dev/null +++ b/yt_dlp/extractor/dlf.py @@ -0,0 +1,192 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + traverse_obj, + url_or_none, +) + + +class DLFBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/' + _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)' + + def _parse_button_attrs(self, button, audio_id=None): + attrs = extract_attributes(button) + audio_id = audio_id or attrs['data-audio-diraid'] + + url = traverse_obj( + attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference', + 'data-audio-src', expected_type=url_or_none) + ext = determine_ext(url) + + return { + 'id': audio_id, + 'extractor_key': DLFIE.ie_key(), + 'extractor': DLFIE.IE_NAME, + **traverse_obj(attrs, { + 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}), + 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}), + 'thumbnail': ('data-audioimage', {url_or_none}), + 'uploader': 'data-audio-producer', + 'series': 'data-audio-series', + 'channel': 'data-audio-origin-site-name', + 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}), + }, get_all=False), + 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False) + if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}]) + } + + +class DLFIE(DLFBaseIE): + IE_NAME = 'dlf' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html' + _TESTS = [ + # Audio as an HLS stream + { + 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html', + 'info_dict': { + 'id': '03a3eb19', + 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien', + 'ext': 'm4a', + 'duration': 3298, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'On Stage', + 'channel': 'deutschlandfunk' + }, + 'params': { + 'skip_download': 'm3u8' + }, + 'skip': 'This webpage no longer exists' + }, { + 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html', + 'info_dict': { + 'id': 'd9cc1856', + 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner', + 'ext': 'mp3', + 'duration': 291, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'Kommentare und Themen der Woche', + 
'channel': 'deutschlandfunk' + } + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + return self._parse_button_attrs( + self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id) + + +class DLFCorpusIE(DLFBaseIE): + IE_NAME = 'dlf:corpus' + IE_DESC = 'DLF Multi-feed Archives' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html' + _TESTS = [ + # Recorded news broadcast with referrals to related broadcasts + { + 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html', + 'info_dict': { + 'id': 'fechten-russland-belarus-ukraine-protest-100', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad' + }, + 'playlist_mincount': 5, + 'playlist': [{ + 'info_dict': { + 'id': '1fc5d64a', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'ext': 'mp3', + 'duration': 252, + 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '2ada145f', + 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten', + 'ext': 'mp3', + 'duration': 336, + 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005', + 'uploader': 'Deutschlandfunk', + 'series': 'Deutschlandfunk Nova', + 'channel': 'deutschlandfunk-nova' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '47e1a096', + 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"', + 'ext': 'mp3', + 'duration': 602, + 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }] + }, + # Podcast feed with tag buttons, playlist count fluctuates + { + 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html', + 'info_dict': { + 'id': 'kommentare-und-themen-der-woche-100', + 'title': 'Meinung - Kommentare und Themen der Woche', + 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5', + }, + 'playlist_mincount': 10, + }, + # Podcast feed with no description + { + 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html', + 'info_dict': { + 'id': 'podcast-tolle-idee-100', + 'title': 
'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?', + }, + 'playlist_mincount': 11, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage, default=None), + 'title': self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=None), + 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)), + } From 2f07c4c1da4361af213e5791279b9d152d2e4ce3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 3 May 2023 15:46:37 -0500 Subject: [PATCH 077/501] [extractor/clipchamp] Add extractor (#6978) Closes #6973 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/clipchamp.py | 61 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 yt_dlp/extractor/clipchamp.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2d582f67f3..974c8a2548 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -356,6 +356,7 @@ ) from .ciscowebex import CiscoWebexIE from .cjsw import CJSWIE +from .clipchamp import ClipchampIE from .cliphunter import CliphunterIE from .clippit import ClippitIE from .cliprs import ClipRsIE diff --git a/yt_dlp/extractor/clipchamp.py b/yt_dlp/extractor/clipchamp.py new file mode 100644 index 0000000000..a8bdf7e509 --- /dev/null +++ b/yt_dlp/extractor/clipchamp.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError(f'Unsupported clip storage location "{storage_location}"') + + path = data['download_url'] + iframe = self._download_webpage( + f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe, + 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return { + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None, + 
**traverse_obj(data, { + 'title': ('project', 'project_name', {str}), + 'timestamp': ('created_at', {unified_timestamp}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } From 45998b3e371b819ce0dbe50da703809a048cc2fe Mon Sep 17 00:00:00 2001 From: Eveldee <eveldee0680@live.fr> Date: Fri, 5 May 2023 07:31:41 +0200 Subject: [PATCH 078/501] [utils] `locked_file`: Fix for virtiofs (#6840) Authored by: brandon-dacrib Closes #6823 --- yt_dlp/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 2f5e667204..47aa75c470 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2187,10 +2187,11 @@ def _lock_file(f, exclusive, block): fcntl.lockf(f, flags) def _unlock_file(f): - try: - fcntl.flock(f, fcntl.LOCK_UN) - except OSError: - fcntl.lockf(f, fcntl.LOCK_UN) + with contextlib.suppress(OSError): + return fcntl.flock(f, fcntl.LOCK_UN) + with contextlib.suppress(OSError): + return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock() + return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking except ImportError: From ddae33754ae1f32dd9c64cf895c47d20f6b5f336 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 5 May 2023 09:41:56 +0530 Subject: [PATCH 079/501] [extractor/youporn] Extract m3u8 formats Closes #6977 --- yt_dlp/extractor/youporn.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 8f1b9911b3..6ee0abcae0 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -6,6 +6,7 @@ int_or_none, merge_dicts, str_to_int, + traverse_obj, unified_strdate, url_or_none, ) @@ -86,32 +87,31 @@ class YouPornIE(InfoExtractor): }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') definitions = self._download_json( - 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, - display_id) + f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id) + + def get_format_data(data, f): + return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl'])) formats = [] - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = url_or_none(definition.get('videoUrl')) - if not video_url: - continue - f = { - 'url': video_url, - 'filesize': int_or_none(definition.get('videoSize')), - } + # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s + for hls_url in traverse_obj(get_format_data(definitions, 'hls'), ( + lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'), (..., 'videoUrl')): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls')) + + for definition in get_format_data(definitions, 'mp4'): + f = traverse_obj(definition, { + 'url': 'videoUrl', + 'filesize': ('videoSize', {int_or_none}) + }) height = int_or_none(definition.get('quality')) # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will 
benefit from it by extracting some metadata - mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) + mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', definition['videoUrl']) if mobj: if not height: height = int(mobj.group('height')) @@ -179,6 +179,7 @@ def extract_tag_box(regex, title): 'tags') data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) + data.pop('url', None) return merge_dicts(data, { 'id': video_id, 'display_id': display_id, From 0c7ce146e4d2a84e656d78f6857952bfd25ab389 Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" <lauren@selfisekai.rocks> Date: Sat, 6 May 2023 02:09:49 +0200 Subject: [PATCH 080/501] [extractor/tvp] Use new API (#6989) Authored by: selfisekai Closes #6987 --- yt_dlp/extractor/tvp.py | 94 ++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index f8ded26463..2aa0dd870a 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -482,21 +482,34 @@ def _real_extract(self, url): class TVPVODBaseIE(InfoExtractor): _API_BASE_URL = 'https://vod.tvp.pl/api/products' - def _call_api(self, resource, video_id, **kwargs): - return self._download_json( + def _call_api(self, resource, video_id, query={}, **kwargs): + is_valid = lambda x: 200 <= x < 300 + document, urlh = self._download_json_handle( f'{self._API_BASE_URL}/{resource}', video_id, - query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs) + query={'lang': 'pl', 'platform': 'BROWSER', **query}, + expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs) + if is_valid(urlh.status): + return document + raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})') - def _parse_video(self, video): - return { - '_type': 'url', - 'url': 'tvp:' + video['externalUid'], - 'ie_key': TVPEmbedIE.ie_key(), - 'title': video.get('title'), - 'description': traverse_obj(video, ('lead', 'description')), - 'age_limit': int_or_none(video.get('rating')), - 'duration': int_or_none(video.get('duration')), - } + def _parse_video(self, video, with_url=True): + info_dict = traverse_obj(video, { + 'id': ('id', {str_or_none}), + 'title': 'title', + 'age_limit': ('rating', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'episode_number': ('number', {int_or_none}), + 'series': ('season', 'serial', 'title', {str_or_none}), + 'thumbnails': ('images', ..., ..., {'url': ('url', {url_or_none})}), + }) + info_dict['description'] = clean_html(dict_get(video, ('lead', 'description'))) + if with_url: + info_dict.update({ + '_type': 'url', + 'url': video['webUrl'], + 'ie_key': TVPVODVideoIE.ie_key(), + }) + return info_dict class TVPVODVideoIE(TVPVODBaseIE): @@ -506,37 +519,70 @@ class TVPVODVideoIE(TVPVODBaseIE): _TESTS = [{ 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', 'info_dict': { - 'id': '60468609', + 'id': '311357', 'ext': 'mp4', - 'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24', + 'title': 'Tusze termiczne. Jak zobaczyć niewidoczne. 
Odcinek 24', 'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c', 'duration': 300, 'episode_number': 24, 'episode': 'Episode 24', 'age_limit': 0, 'series': 'Laboratorium alchemika', - 'thumbnail': 're:https://.+', + 'thumbnail': 're:https?://.+', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667', 'info_dict': { - 'id': '51640077', + 'id': '339667', 'ext': 'mp4', - 'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu', - 'series': 'Ukraiński sługa narodu', + 'title': 'Ukraiński sługa narodu', 'description': 'md5:b7940c0a8e439b0c81653a986f544ef3', 'age_limit': 12, - 'episode': 'Episode 0', - 'episode_number': 0, 'duration': 3051, - 'thumbnail': 're:https://.+', + 'thumbnail': 're:https?://.+', + 'subtitles': 'count:2', }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'embed fails with "payment required"', + 'url': 'https://vod.tvp.pl/seriale,18/polowanie-na-cmy-odcinki,390116/odcinek-7,S01E07,398869', + 'info_dict': { + 'id': '398869', + 'ext': 'mp4', + 'title': 'odc. 7', + 'description': 'md5:dd2bb33f023dc5c2fbaddfbe4cb5dba0', + 'duration': 2750, + 'age_limit': 16, + 'series': 'Polowanie na ćmy', + 'episode_number': 7, + 'episode': 'Episode 7', + 'thumbnail': 're:https?://.+', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): video_id = self._match_id(url) - return self._parse_video(self._call_api(f'vods/{video_id}', video_id)) + info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False) + + playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'}) + + info_dict['formats'] = [] + for manifest_url in traverse_obj(playlist, ('sources', 'HLS', ..., 'src')): + info_dict['formats'].extend(self._extract_m3u8_formats(manifest_url, video_id, fatal=False)) + for manifest_url in traverse_obj(playlist, ('sources', 'DASH', ..., 'src')): + info_dict['formats'].extend(self._extract_mpd_formats(manifest_url, video_id, fatal=False)) + + info_dict['subtitles'] = {} + for sub in playlist.get('subtitles') or []: + info_dict['subtitles'].setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['url'], + 'ext': 'ttml', + }) + + return info_dict class TVPVODSeriesIE(TVPVODBaseIE): @@ -551,7 +597,7 @@ class TVPVODSeriesIE(TVPVODBaseIE): 'age_limit': 12, 'categories': ['seriale'], }, - 'playlist_count': 129, + 'playlist_count': 130, }, { 'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514', 'only_matching': True, From c449c0655d7c8549e6e1389c26b628053b253d39 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Sat, 6 May 2023 18:14:40 +0900 Subject: [PATCH 081/501] [extractor/abematv] Add fallback for title and description extraction and extract more metadata (#6994) Authored by: Lesmiscore --- yt_dlp/extractor/abematv.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index f611c1f2c2..c9166b6b8c 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -436,6 +436,16 @@ def _real_extract(self, url): if 3 not in ondemand_types: # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') + info.update(traverse_obj(api_response, { + 'series': ('series', 'title'), + 'season': ('season', 'title'), + 'season_number': ('season', 'sequence'), + 'episode_number': ('episode', 'number'), + })) + if not title: + title = traverse_obj(api_response, 
('episode', 'title'))
+            if not description:
+                description = traverse_obj(api_response, ('episode', 'content'))
 
             m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
     elif video_type == 'slots':

From 3b52a606881e6adadc33444abdeacce562b79330 Mon Sep 17 00:00:00 2001
From: ringus1 <ringus1@users.noreply.github.com>
Date: Tue, 9 May 2023 01:19:42 +0200
Subject: [PATCH 082/501] [extractor/facebook] Fix metadata extraction (#6856)

Closes #3432

Authored by: ringus1
---
 yt_dlp/extractor/facebook.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 1404be612e..9d871eb286 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -390,7 +390,10 @@ def extract_metadata(webpage):
                 k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
         title = get_first(media, ('title', 'text'))
         description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
-        uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
+        uploader_data = (
+            get_first(media, ('owner', {dict}))
+            or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+            or get_first(post, ('node', 'actors', ..., {dict})) or {})
 
         page_title = title or self._html_search_regex((
             r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
@@ -415,16 +418,17 @@ def extract_metadata(webpage):
         # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
         if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
             thumbnail = None
-        view_count = parse_count(self._search_regex(
-            r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
-            default=None))
         info_dict = {
             'description': description,
             'uploader': uploader,
             'uploader_id': uploader_data.get('id'),
             'timestamp': timestamp,
             'thumbnail': thumbnail,
-            'view_count': view_count,
+            'view_count': parse_count(self._search_regex(
+                (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
+                webpage, 'view count', default=None)),
+            'concurrent_view_count': get_first(post, (
+                ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
         }
 
         info_json_ld = self._search_json_ld(webpage, video_id, default={})

From ef8fb7f029b816dfc95600727d84400591a3b5c5 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Mon, 8 May 2023 18:45:31 -0500
Subject: [PATCH 083/501] [extractor/wrestleuniverse] Fix extraction, add login (#6982)

Closes #6975

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>
---
 README.md | 3 +
 yt_dlp/extractor/wrestleuniverse.py | 137 +++++++++++++++++++++-------
 2 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index c1f34235db..993ac5a5f6 100644
--- a/README.md
+++ b/README.md
@@ -1835,6 +1835,9 @@ #### rokfinchannel
 #### twitter
 * `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
 
+#### wrestleuniverse
+* `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. 
Can be found in browser local storage + **Note**: These options may be changed/removed in the future without concern for backward compatibility <!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE --> diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 5c6dec2c40..946edf20a4 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -2,6 +2,7 @@ import binascii import json import time +import uuid from .common import InfoExtractor from ..dependencies import Cryptodome @@ -12,30 +13,95 @@ traverse_obj, try_call, url_or_none, + urlencode_postdata, ) class WrestleUniverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'wrestleuniverse' _VALID_URL_TMPL = r'https?://(?:www\.)?wrestle-universe\.com/(?:(?P<lang>\w{2})/)?%s/(?P<id>\w+)' _API_PATH = None - _TOKEN = None + _REAL_TOKEN = None _TOKEN_EXPIRY = None + _REFRESH_TOKEN = None + _DEVICE_ID = None + _LOGIN_QUERY = {'key': 'AIzaSyCaRPBsDQYVDUWWBXjsTrHESi2r_F3RAdA'} + _LOGIN_HEADERS = { + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web', + 'X-Firebase-gmpid': '1:307308870738:web:820f38fe5150c8976e338b', + 'Referer': 'https://www.wrestle-universe.com/', + 'Origin': 'https://www.wrestle-universe.com', + } - def _get_token_cookie(self): - if not self._TOKEN or not self._TOKEN_EXPIRY: - self._TOKEN = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value) - if not self._TOKEN: + @property + def _TOKEN(self): + if not self._REAL_TOKEN or not self._TOKEN_EXPIRY: + token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value) + if not token and not self._REFRESH_TOKEN: self.raise_login_required() - expiry = traverse_obj(jwt_decode_hs256(self._TOKEN), ('exp', {int_or_none})) - if not expiry: - raise ExtractorError('There was a problem with the token cookie') - self._TOKEN_EXPIRY = expiry + self._REAL_TOKEN = token - if self._TOKEN_EXPIRY <= int(time.time()): - raise ExtractorError( - 'Expired token. Refresh your cookies in browser and try again', expected=True) + if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()): + if not self._REFRESH_TOKEN: + raise ExtractorError( + 'Expired token. 
Refresh your cookies in browser and try again', expected=True) + self._refresh_token() - return self._TOKEN + return self._REAL_TOKEN + + @_TOKEN.setter + def _TOKEN(self, value): + self._REAL_TOKEN = value + + expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int_or_none})) + if not expiry: + raise ExtractorError('There was a problem with the auth token') + self._TOKEN_EXPIRY = expiry + + def _perform_login(self, username, password): + login = self._download_json( + 'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword', None, + 'Logging in', query=self._LOGIN_QUERY, headers=self._LOGIN_HEADERS, data=json.dumps({ + 'returnSecureToken': True, + 'email': username, + 'password': password, + }, separators=(',', ':')).encode()) + self._REFRESH_TOKEN = traverse_obj(login, ('refreshToken', {str})) + if not self._REFRESH_TOKEN: + self.report_warning('No refresh token was granted') + self._TOKEN = traverse_obj(login, ('idToken', {str})) + + def _real_initialize(self): + if WrestleUniverseBaseIE._DEVICE_ID: + return + + WrestleUniverseBaseIE._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key='WrestleUniverse')[0] + if not WrestleUniverseBaseIE._DEVICE_ID: + WrestleUniverseBaseIE._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id') + if WrestleUniverseBaseIE._DEVICE_ID: + return + WrestleUniverseBaseIE._DEVICE_ID = str(uuid.uuid4()) + + self.cache.store(self._NETRC_MACHINE, 'device_id', WrestleUniverseBaseIE._DEVICE_ID) + + def _refresh_token(self): + refresh = self._download_json( + 'https://securetoken.googleapis.com/v1/token', None, 'Refreshing token', + query=self._LOGIN_QUERY, data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': self._REFRESH_TOKEN, + }), headers={ + **self._LOGIN_HEADERS, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + if traverse_obj(refresh, ('refresh_token', {str})): + self._REFRESH_TOKEN = refresh['refresh_token'] + token = traverse_obj(refresh, 'access_token', 'id_token', expected_type=str) + if not token: + raise ExtractorError('No auth token returned from refresh request') + self._TOKEN = token def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={}, fatal=True): headers = {'CA-CID': ''} @@ -43,7 +109,7 @@ def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={ headers['Content-Type'] = 'application/json;charset=utf-8' data = json.dumps(data, separators=(',', ':')).encode() if auth: - headers['Authorization'] = f'Bearer {self._get_token_cookie()}' + headers['Authorization'] = f'Bearer {self._TOKEN}' return self._download_json( f'https://api.wrestle-universe.com/v1/{self._API_PATH}/{video_id}{param}', video_id, note=f'Downloading {msg} JSON', errnote=f'Failed to download {msg} JSON', @@ -65,7 +131,7 @@ def decrypt(data): token = base64.b64encode(private_key.public_key().export_key('DER')).decode() api_json = self._call_api(video_id, param, msg, data={ - # 'deviceId' (random uuid4 generated at login) is not required yet + 'deviceId': self._DEVICE_ID, 'token': token, **data, }, query=query, fatal=fatal) @@ -105,7 +171,7 @@ class WrestleUniverseVODIE(WrestleUniverseBaseIE): 'upload_date': '20230129', 'thumbnail': 'https://image.asset.wrestle-universe.com/8FjD67P8rZc446RBQs5RBN/8FjD67P8rZc446RBQs5RBN', 'chapters': 'count:7', - 'cast': 'count:18', + 'cast': 'count:21', }, 'params': { 'skip_download': 'm3u8', @@ -169,6 +235,7 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'No longer 
available',
     }, {
         'note': 'unencrypted HLS',
         'url': 'https://www.wrestle-universe.com/en/lives/wUG8hP5iApC63jbtQzhVVx',
@@ -196,14 +263,17 @@ def _real_extract(self, url):
         lang, video_id = self._match_valid_url(url).group('lang', 'id')
         metadata = self._download_metadata(url, video_id, lang, 'eventFallbackData')
 
-        info = traverse_obj(metadata, {
-            'title': ('displayName', {str}),
-            'description': ('description', {str}),
-            'channel': ('labels', 'group', {str}),
-            'location': ('labels', 'venue', {str}),
-            'timestamp': ('startTime', {int_or_none}),
-            'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}),
-        })
+        info = {
+            'id': video_id,
+            **traverse_obj(metadata, {
+                'title': ('displayName', {str}),
+                'description': ('description', {str}),
+                'channel': ('labels', 'group', {str}),
+                'location': ('labels', 'venue', {str}),
+                'timestamp': ('startTime', {int_or_none}),
+                'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}),
+            }),
+        }
 
         ended_time = traverse_obj(metadata, ('endedTime', {int_or_none}))
         if info.get('timestamp') and ended_time:
@@ -211,23 +281,20 @@
         video_data, decrypt = self._call_encrypted_api(
             video_id, ':watchArchive', 'watch archive', data={'method': 1})
-        formats = self._get_formats(video_data, (
+        info['formats'] = self._get_formats(video_data, (
             ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id)
-        for f in formats:
+        for f in info['formats']:
             # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
             if f.get('tbr'):
                 f['tbr'] = int(f['tbr'] / 2.5)
 
         hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
-        if not hls_aes_key and traverse_obj(video_data, ('hls', 'encryptType', {int}), default=0) > 0:
-            self.report_warning('HLS AES-128 key was not found in API response')
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'hls_aes': {
+        if hls_aes_key:
+            info['hls_aes'] = {
                 'key': hls_aes_key,
                 'iv': traverse_obj(video_data, ('hls', 'iv', {decrypt})),
-            },
-            **info,
-        }
+            }
+        elif traverse_obj(video_data, ('hls', 'encryptType', {int})):
+            self.report_warning('HLS AES-128 key was not found in API response')
+
+        return info

From 21b9413cf7dd4830b2ece57af21589dd4538fc52 Mon Sep 17 00:00:00 2001
From: toomyzoom <52140413+toomyzoom@users.noreply.github.com>
Date: Thu, 11 May 2023 02:48:35 -0700
Subject: [PATCH 084/501] [extractor/iwara] Implement login (#6721)

Authored by: toomyzoom
---
 yt_dlp/extractor/iwara.py | 88 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py
index a5aad26ee8..bdc39a7ddb 100644
--- a/yt_dlp/extractor/iwara.py
+++ b/yt_dlp/extractor/iwara.py
@@ -1,6 +1,7 @@
 import functools
 import urllib.parse
 import hashlib
+import json
 
 from .common import InfoExtractor
 from ..utils import (
@@ -14,7 +15,49 @@
 )
 
 
-class IwaraIE(InfoExtractor):
+# https://github.com/yt-dlp/yt-dlp/issues/6671
+class IwaraBaseIE(InfoExtractor):
+    _USERTOKEN = None
+    _MEDIATOKEN = None
+    _NETRC_MACHINE = 'iwara'
+
+    def _get_user_token(self, invalidate=False):
+        if not invalidate and self._USERTOKEN:
+            return self._USERTOKEN
+
+        username, password = self._get_login_info()
+        IwaraBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
+        if not IwaraBaseIE._USERTOKEN or invalidate:
+            IwaraBaseIE._USERTOKEN = self._download_json(
+                'https://api.iwara.tv/user/login', None, note='Logging in',
+                
data=json.dumps({ + 'email': username, + 'password': password + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json' + })['token'] + + self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) + + return self._USERTOKEN + + def _get_media_token(self, invalidate=False): + if not invalidate and self._MEDIATOKEN: + return self._MEDIATOKEN + + IwaraBaseIE._MEDIATOKEN = self._download_json( + 'https://api.iwara.tv/user/token', None, note='Fetching media token', + data=b'', # Need to have some data here, even if it's empty + headers={ + 'Authorization': f'Bearer {self._get_user_token()}', + 'Content-Type': 'application/json' + })['accessToken'] + + return self._MEDIATOKEN + + +class IwaraIE(IwaraBaseIE): IE_NAME = 'iwara' _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ @@ -56,6 +99,26 @@ class IwaraIE(InfoExtractor): 'timestamp': 1678732213, 'modified_timestamp': 1679110271, }, + }, { + 'url': 'https://iwara.tv/video/blggmfno8ghl725bg', + 'info_dict': { + 'id': 'blggmfno8ghl725bg', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'お外でおしっこしちゃう猫耳ロリメイド', + 'description': 'md5:0342ba9bf6db09edbbb28729657c3611', + 'uploader': 'Fe_Kurosabi', + 'uploader_id': 'fekurosabi', + 'tags': [ + 'pee' + ], + 'like_count': 192, + 'view_count': 12119, + 'comment_count': 0, + 'timestamp': 1598880567, + 'modified_timestamp': 1598908995, + 'availability': 'needs_auth', + }, }] def _extract_formats(self, video_id, fileurl): @@ -79,12 +142,18 @@ def _extract_formats(self, video_id, fileurl): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True) + username, password = self._get_login_info() + headers = { + 'Authorization': f'Bearer {self._get_media_token()}', + } if username and password else None + video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True, headers=headers) errmsg = video_data.get('message') # at this point we can actually get uploaded user info, but do we need it? if errmsg == 'errors.privateVideo': self.raise_login_required('Private video. 
Login if you have permissions to watch') - elif errmsg: + elif errmsg == 'errors.notFound' and not username: + self.raise_login_required('Video may need login to view') + elif errmsg: # None if success raise ExtractorError(f'Iwara says: {errmsg}') if not video_data.get('fileUrl'): @@ -112,8 +181,17 @@ def _real_extract(self, url): 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))), } + def _perform_login(self, username, password): + if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token(): + self.write_debug('Skipping logging in') + return -class IwaraUserIE(InfoExtractor): + IwaraBaseIE._USERTOKEN = self._get_user_token(True) + self._get_media_token(True) + self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) + + +class IwaraUserIE(IwaraBaseIE): _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)' IE_NAME = 'iwara:user' _PER_PAGE = 32 @@ -165,7 +243,7 @@ def _real_extract(self, url): playlist_id, traverse_obj(user_info, ('user', 'name'))) -class IwaraPlaylistIE(InfoExtractor): +class IwaraPlaylistIE(IwaraBaseIE): # the ID is an UUID but I don't think it's necessary to write concrete regex _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)' IE_NAME = 'iwara:playlist' From c8bc203fbf3bb09914e53f0833eed622ab7edbb9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 May 2023 02:35:08 +0530 Subject: [PATCH 085/501] [docs] Misc improvements Closes #6814, closes #6940, closes #6733, closes #6923, closes #6566, closes #6726, closes #6728 --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 6 ++--- .github/ISSUE_TEMPLATE/4_bug_report.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml | 6 ++--- .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 6 +++++ CONTRIBUTING.md | 4 ++-- Collaborators.md | 8 +++---- README.md | 23 ++++++++++--------- yt_dlp/YoutubeDL.py | 5 ++-- yt_dlp/extractor/unsupported.py | 5 ++-- yt_dlp/options.py | 10 ++++---- 11 files changed, 43 insertions(+), 34 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index cdbb867603..77b777d5a9 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -1,5 +1,5 @@ -name: Broken site -description: Report error in a supported site +name: Broken site support +description: Report issue with yt-dlp on a supported site labels: [triage, site-bug] body: - type: checkboxes @@ -16,7 +16,7 @@ body: description: | Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: options: - - label: I'm reporting that a **supported** site is broken + - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index bf1d97bbae..122dda4f26 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -1,4 +1,4 @@ -name: Bug report +name: Core bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index 1f6f926341..a51db789f3 100644 --- 
a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -1,5 +1,5 @@ -name: Broken site -description: Report error in a supported site +name: Broken site support +description: Report issue with yt-dlp on a supported site labels: [triage, site-bug] body: %(no_skip)s @@ -10,7 +10,7 @@ body: description: | Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: options: - - label: I'm reporting that a **supported** site is broken + - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 90f59e70b0..9ab4902673 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -1,4 +1,4 @@ -name: Bug report +name: Core bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c4d3e812e2..cbed821734 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -40,4 +40,10 @@ ### What is the purpose of your *pull request*? - [ ] Core bug fix/improvement - [ ] New feature (It is strongly [recommended to open an issue first](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#adding-new-feature-or-making-overarching-changes)) + +<!-- Do NOT edit/remove anything below this! --> +</details><details><summary>Copilot Summary</summary> + +copilot:all + </details> diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ae2c454239..a8587fe92d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,7 +79,7 @@ ### Are you using the latest version? ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2021.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, subscribe to it to be notified when there is any progress. Unless you have something useful to add to the conversation, please refrain from commenting. Additionally, it is also helpful to see if the issue has already been documented in the [youtube-dl issue tracker](https://github.com/ytdl-org/youtube-dl/issues). If similar issues have already been reported in youtube-dl (but not in our issue tracker), links to them can be included in your issue report here. @@ -246,7 +246,7 @@ ## yt-dlp coding conventions This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. -Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. 
As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the the extractor will remain broken. +Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the extractor will remain broken. ### Mandatory and optional metafields diff --git a/Collaborators.md b/Collaborators.md index 71baf5080b..a0976dd8c5 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -8,7 +8,7 @@ # Collaborators ## [pukkandan](https://github.com/pukkandan) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) * Owner of the fork @@ -26,7 +26,7 @@ ## [shirt](https://github.com/shirt-dev) ## [coletdjnz](https://github.com/coletdjnz) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) * Improved plugin architecture * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements @@ -44,7 +44,7 @@ ## [Ashish0804](https://github.com/Ashish0804) <sub><sup>[Inactive]</sup></sub> * Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc -## [Lesmiscore](https://github.com/Lesmiscore) <sub><sup>(nao20010128nao)</sup></sub> +## [Lesmiscore](https://github.com/Lesmiscore) **Bitcoin**: bc1qfd02r007cutfdjwjmyy9w23rjvtls6ncve7r3s **Monacoin**: mona1q3tf7dzvshrhfe3md379xtvt2n22duhglv5dskr @@ -64,7 +64,7 @@ ## [bashonly](https://github.com/bashonly) ## [Grub4K](https://github.com/Grub4K) -[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) [![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) +[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) 
[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) * `--update-to`, automated release, nightly builds * Rework internals like `traverse_obj`, various core refactors and bugs fixes diff --git a/README.md b/README.md index 993ac5a5f6..6dff57b4c5 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ # NEW FEATURES * **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. * **YouTube improvements**: - * Supports Clips, Stories (`ytstories:<channel UCID>`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, YouTube Music Albums/Channels ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)), and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) + * Supports Clips, Stories (`ytstories:<channel UCID>`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) @@ -179,13 +179,13 @@ # INSTALLATION [![All versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases) <!-- MANPAGE: END EXCLUDED SECTION --> -You can install yt-dlp using [the binaries](#release-files), [PIP](https://pypi.org/project/yt-dlp) or one using a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions +You can install yt-dlp using [the binaries](#release-files), [pip](https://pypi.org/project/yt-dlp) or one using a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions ## UPDATE You can use `yt-dlp -U` to update if you are using the [release binaries](#release-files) -If you [installed with PIP](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program +If you [installed with pip](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer their documentation @@ -409,7 +409,8 @@ ## General Options: configuration files --flat-playlist Do not extract the videos of a playlist, only list them - --no-flat-playlist Extract the videos of a playlist + --no-flat-playlist Fully extract the videos of a playlist + (default) --live-from-start Download livestreams from the start. 
Currently only supported for YouTube (Experimental) @@ -465,9 +466,9 @@ ## Geo-restriction: downloading --xff VALUE How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. One of - "default" (Only when known to be useful), - "never", a two-letter ISO 3166-2 country - code, or an IP block in CIDR notation + "default" (only when known to be useful), + "never", an IP block in CIDR notation, or a + two-letter ISO 3166-2 country code ## Video Selection: -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items @@ -514,7 +515,7 @@ ## Video Selection: dogs" (caseless). Use "--match-filter -" to interactively ask whether to download each video - --no-match-filter Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filter (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -1709,7 +1710,7 @@ # MODIFYING METADATA This option also has a few special uses: -* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)` will download the first vimeo video found in the description +* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)"` will download the first vimeo video found in the description * You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file - you can use this to set a different "description" and "synopsis". To modify the metadata of individual streams, use the `meta<n>_` prefix (e.g. `meta1_language`). Any value set to the `meta_` field will overwrite all default values. @@ -1883,7 +1884,7 @@ ## Installing Plugins * **System Plugins** * `/etc/yt-dlp/plugins/<package name>/yt_dlp_plugins/` * `/etc/yt-dlp-plugins/<package name>/yt_dlp_plugins/` -2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location: +2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location (recommended for portable installations): * Binary: where `<root-dir>/yt-dlp.exe`, `<root-dir>/yt-dlp-plugins/<package name>/yt_dlp_plugins/` * Source: where `<root-dir>/yt_dlp/__main__.py`, `<root-dir>/yt-dlp-plugins/<package name>/yt_dlp_plugins/` @@ -2071,7 +2072,7 @@ #### Use a custom format selector ```python import yt_dlp -URL = ['https://www.youtube.com/watch?v=BaW_jenozKc'] +URLS = ['https://www.youtube.com/watch?v=BaW_jenozKc'] def format_selector(ctx): """ Select the best video and the best audio that won't result in an mkv. diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8ee42b86a6..8f52a71a95 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -190,6 +190,7 @@ class YoutubeDL: ap_username: Multiple-system operator account username. ap_password: Multiple-system operator account password. usenetrc: Use netrc for authentication instead. 
+ netrc_location: Location of the netrc file. Defaults to ~/.netrc. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. @@ -3994,7 +3995,7 @@ def _write_subtitles(self, info_dict, filename): # that way it will silently go on when used with unsupporting IE return ret elif not subtitles: - self.to_screen('[info] There\'s no subtitles for the requested languages') + self.to_screen('[info] There are no subtitles for the requested languages') return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: @@ -4048,7 +4049,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] if not thumbnails: - self.to_screen(f'[info] There\'s no {label} thumbnails to download') + self.to_screen(f'[info] There are no {label} thumbnails to download') return ret multiple = write_all and len(thumbnails) > 1 diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index a56bd284f9..1bc49786f9 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -131,8 +131,9 @@ class KnownPiracyIE(UnsupportedInfoExtractor): URLS = ( r'dood\.(?:to|watch|so|pm|wf|re)', # Sites youtube-dl supports, but we won't - r'https://viewsb\.com', - r'https://filemoon\.sx', + r'viewsb\.com', + r'filemoon\.sx', + r'hentai\.animestigma\.com', ) _TESTS = [{ diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 362a648cdd..dc46ce9984 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -411,7 +411,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '--no-flat-playlist', action='store_false', dest='extract_flat', - help='Extract the videos of a playlist') + help='Fully extract the videos of a playlist (default)') general.add_option( '--live-from-start', action='store_true', dest='live_from_start', @@ -521,11 +521,11 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=optparse.SUPPRESS_HELP) geo.add_option( '--xff', metavar='VALUE', - dest='geo_bypass', default="default", + dest='geo_bypass', default='default', help=( 'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. ' - 'One of "default" (Only when known to be useful), "never", ' - 'a two-letter ISO 3166-2 country code, or an IP block in CIDR notation')) + 'One of "default" (only when known to be useful), "never", ' + 'an IP block in CIDR notation, or a two-letter ISO 3166-2 country code')) geo.add_option( '--geo-bypass', action='store_const', dest='geo_bypass', const='default', @@ -617,7 +617,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'that contains the phrase "cats & dogs" (caseless). ' 'Use "--match-filter -" to interactively ask whether to download each video')) selection.add_option( - '--no-match-filter', + '--no-match-filters', dest='match_filter', action='store_const', const=None, help='Do not use any --match-filter (default)') selection.add_option( From f7f7a877bf8e87fd4eb0ad2494ad948ca7691114 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 May 2023 04:05:22 +0530 Subject: [PATCH 086/501] [extractor/booyah] Remove extractor Site shut down. 
Closes #6425 --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/booyah.py | 86 --------------------------------- 2 files changed, 87 deletions(-) delete mode 100644 yt_dlp/extractor/booyah.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 974c8a2548..fd2bfa9a10 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -247,7 +247,6 @@ from .bostonglobe import BostonGlobeIE from .box import BoxIE from .boxcast import BoxCastVideoIE -from .booyah import BooyahClipsIE from .bpb import BpbIE from .br import ( BRIE, diff --git a/yt_dlp/extractor/booyah.py b/yt_dlp/extractor/booyah.py deleted file mode 100644 index 5c55f2c765..0000000000 --- a/yt_dlp/extractor/booyah.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none, str_or_none, traverse_obj - - -class BooyahBaseIE(InfoExtractor): - _BOOYAH_SESSION_KEY = None - - def _real_initialize(self): - BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( - 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') - - def _get_comments(self, video_id): - comment_json = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, - headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} - - return [{ - 'id': comment.get('comment_id'), - 'author': comment.get('from_nickname'), - 'author_id': comment.get('from_uid'), - 'author_thumbnail': comment.get('from_thumbnail'), - 'text': comment.get('content'), - 'timestamp': comment.get('create_time'), - 'like_count': comment.get('like_cnt'), - } for comment in comment_json.get('comment_list') or ()] - - -class BooyahClipsIE(BooyahBaseIE): - _VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://booyah.live/clips/13887261322952306617', - 'info_dict': { - 'id': '13887261322952306617', - 'ext': 'mp4', - 'view_count': int, - 'duration': 30, - 'channel_id': 90565760, - 'like_count': int, - 'title': 'Cayendo con estilo 😎', - 'uploader': '♡LɪꜱGΛ​MER​', - 'comment_count': int, - 'uploader_id': '90565760', - 'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', - 'upload_date': '20220617', - 'timestamp': 1655490556, - 'modified_timestamp': 1655490556, - 'modified_date': '20220617', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, - headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) - - formats = [] - for video_data in json_data['playback']['endpoint_list']: - formats.extend(({ - 'url': video_data.get('stream_url'), - 'ext': 'mp4', - 'height': video_data.get('resolution'), - }, { - 'url': video_data.get('download_url'), - 'ext': 'mp4', - 'format_note': 'Watermarked', - 'height': video_data.get('resolution'), - 'preference': -10, - })) - - return { - 'id': video_id, - 'title': traverse_obj(json_data, ('playback', 'name')), - 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), - 'formats': formats, - 'view_count': traverse_obj(json_data, ('playback', 'views')), - 'like_count': traverse_obj(json_data, ('playback', 'likes')), - 'duration': traverse_obj(json_data, ('playback', 'duration')), - 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), - 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), - 'uploader': 
traverse_obj(json_data, ('user', 'nickname')), - 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), - 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), - 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), - '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), - } From 1d7656184c6b8aa46b29149893894b3c24f1df00 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 May 2023 02:57:59 +0530 Subject: [PATCH 087/501] [jsinterp] Handle `NaN` in bitwise operators Closes #6131 --- test/test_jsinterp.py | 10 ++++++++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 7 ++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 3283657d70..26711502a4 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -445,6 +445,16 @@ def test_bitwise_operators_overflow(self): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_bitwise_operators_typecast(self): + jsi = JSInterpreter('function x(){return null << 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return undefined >> 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return 42 << NaN}') + self.assertEqual(jsi.call_function('x'), 42) + def test_negative(self): jsi = JSInterpreter("function f(){return 2 * -2.0;}") self.assertEqual(jsi.call_function('f'), -4) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index e2b3f0870d..13120d97f8 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -146,6 +146,10 @@ 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', ), + ( + 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', + 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 5571ecfeb1..965b1c0f29 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -20,7 +20,12 @@ def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + if x in (None, JS_Undefined): + return 0 + with contextlib.suppress(TypeError): + if math.isnan(x): # NB: NaN cannot be checked by membership + return 0 + return x def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff From 6f2287cb18cbfb27518f068d868fa9390fee78ad Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 May 2023 03:06:23 +0530 Subject: [PATCH 088/501] [cleanup] Misc Closes #7030, closes #6967 --- test/helper.py | 4 +- test/test_YoutubeDL.py | 8 +- test/test_jsinterp.py | 558 +++++++++++++++-------------------------- yt_dlp/YoutubeDL.py | 65 ++--- yt_dlp/jsinterp.py | 2 +- yt_dlp/utils.py | 8 +- 6 files changed, 243 insertions(+), 402 deletions(-) diff --git a/test/helper.py b/test/helper.py index 0b90660ff6..539b2f6189 100644 --- a/test/helper.py +++ b/test/helper.py @@ -194,8 +194,8 @@ def sanitize_got_info_dict(got_dict): 'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries', # Auto-generated - 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch', - 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', 'n_entries', + 'autonumber', 'playlist', 'format_index', 
'video_ext', 'audio_ext', 'duration_string', 'epoch', 'n_entries', + 'fulltitle', 'extractor', 'extractor_key', 'filename', 'filepath', 'infojson_filename', 'original_url', # Only live_status needs to be checked 'is_live', 'was_live', diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3c26bd7c65..477fd220ef 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -757,7 +757,7 @@ def expect_same_infodict(out): test('%(id)r %(height)r', "'1234' 1080") test('%(ext)s-%(ext|def)d', 'mp4-def') test('%(width|0)04d', '0000') - test('a%(width|)d', 'a', outtmpl_na_placeholder='none') + test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none') FORMATS = self.outtmpl_info['formats'] sanitize = lambda x: x.replace(':', ':').replace('"', """).replace('\n', ' ') @@ -871,12 +871,12 @@ def test_postprocessors(self): class SimplePP(PostProcessor): def run(self, info): - with open(audiofile, 'wt') as f: + with open(audiofile, 'w') as f: f.write('EXAMPLE') return [info['filepath']], info def run_pp(params, PP): - with open(filename, 'wt') as f: + with open(filename, 'w') as f: f.write('EXAMPLE') ydl = YoutubeDL(params) ydl.add_post_processor(PP()) @@ -895,7 +895,7 @@ def run_pp(params, PP): class ModifierPP(PostProcessor): def run(self, info): - with open(info['filepath'], 'wt') as f: + with open(info['filepath'], 'w') as f: f.write('MODIFIED') return [], info diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 26711502a4..444909b84b 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -14,462 +14,302 @@ class TestJSInterpreter(unittest.TestCase): + def _test(self, code, ret, func='f', args=()): + self.assertEqual(JSInterpreter(code).call_function(func, *args), ret) + def test_basic(self): - jsi = JSInterpreter('function x(){;}') - self.assertEqual(jsi.call_function('x'), None) - - jsi = JSInterpreter('function x3(){return 42;}') - self.assertEqual(jsi.call_function('x3'), 42) - - jsi = JSInterpreter('function x3(){42}') - self.assertEqual(jsi.call_function('x3'), None) - - jsi = JSInterpreter('var x5 = function(){return 42;}') - self.assertEqual(jsi.call_function('x5'), 42) - - def test_calc(self): - jsi = JSInterpreter('function x4(a){return 2*a+1;}') - self.assertEqual(jsi.call_function('x4', 3), 7) - - def test_empty_return(self): - jsi = JSInterpreter('function f(){return; y()}') + jsi = JSInterpreter('function f(){;}') + self.assertEqual(repr(jsi.extract_function('f')), 'F<f>') self.assertEqual(jsi.call_function('f'), None) - def test_morespace(self): - jsi = JSInterpreter('function x (a) { return 2 * a + 1 ; }') - self.assertEqual(jsi.call_function('x', 3), 7) + self._test('function f(){return 42;}', 42) + self._test('function f(){42}', None) + self._test('var f = function(){return 42;}', 42) - jsi = JSInterpreter('function f () { x = 2 ; return x; }') - self.assertEqual(jsi.call_function('f'), 2) + def test_calc(self): + self._test('function f(a){return 2*a+1;}', 7, args=[3]) + + def test_empty_return(self): + self._test('function f(){return; y()}', None) + + def test_morespace(self): + self._test('function f (a) { return 2 * a + 1 ; }', 7, args=[3]) + self._test('function f () { x = 2 ; return x; }', 2) def test_strange_chars(self): - jsi = JSInterpreter('function $_xY1 ($_axY1) { var $_axY2 = $_axY1 + 1; return $_axY2; }') - self.assertEqual(jsi.call_function('$_xY1', 20), 21) + self._test('function $_xY1 ($_axY1) { var $_axY2 = $_axY1 + 1; return $_axY2; }', + 21, args=[20], func='$_xY1') def test_operators(self): - jsi = JSInterpreter('function 
f(){return 1 << 5;}') - self.assertEqual(jsi.call_function('f'), 32) - - jsi = JSInterpreter('function f(){return 2 ** 5}') - self.assertEqual(jsi.call_function('f'), 32) - - jsi = JSInterpreter('function f(){return 19 & 21;}') - self.assertEqual(jsi.call_function('f'), 17) - - jsi = JSInterpreter('function f(){return 11 >> 2;}') - self.assertEqual(jsi.call_function('f'), 2) - - jsi = JSInterpreter('function f(){return []? 2+3: 4;}') - self.assertEqual(jsi.call_function('f'), 5) - - jsi = JSInterpreter('function f(){return 1 == 2}') - self.assertEqual(jsi.call_function('f'), False) - - jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') - self.assertEqual(jsi.call_function('f'), 2) - - jsi = JSInterpreter('function f(){return 0 ?? 42;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return "life, the universe and everything" < 42;}') - self.assertFalse(jsi.call_function('f')) + self._test('function f(){return 1 << 5;}', 32) + self._test('function f(){return 2 ** 5}', 32) + self._test('function f(){return 19 & 21;}', 17) + self._test('function f(){return 11 >> 2;}', 2) + self._test('function f(){return []? 2+3: 4;}', 5) + self._test('function f(){return 1 == 2}', False) + self._test('function f(){return 0 && 1 || 2;}', 2) + self._test('function f(){return 0 ?? 42;}', 0) + self._test('function f(){return "life, the universe and everything" < 42;}', False) def test_array_access(self): - jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') - self.assertEqual(jsi.call_function('f'), [5, 2, 7]) + self._test('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}', [5, 2, 7]) def test_parens(self): - jsi = JSInterpreter('function f(){return (1) + (2) * ((( (( (((((3)))))) )) ));}') - self.assertEqual(jsi.call_function('f'), 7) - - jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') - self.assertEqual(jsi.call_function('f'), 9) + self._test('function f(){return (1) + (2) * ((( (( (((((3)))))) )) ));}', 7) + self._test('function f(){return (1 + 2) * 3;}', 9) def test_quotes(self): - jsi = JSInterpreter(R'function f(){return "a\"\\("}') - self.assertEqual(jsi.call_function('f'), R'a"\(') + self._test(R'function f(){return "a\"\\("}', R'a"\(') def test_assignments(self): - jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), 31) - - jsi = JSInterpreter('function f(){var x = 20; x += 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), 51) - - jsi = JSInterpreter('function f(){var x = 20; x -= 30 + 1; return x;}') - self.assertEqual(jsi.call_function('f'), -11) + self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31) + self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) + self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) def test_comments(self): 'Skipping: Not yet fully implemented' return - jsi = JSInterpreter(''' - function x() { - var x = /* 1 + */ 2; - var y = /* 30 - * 40 */ 50; - return x + y; - } - ''') - self.assertEqual(jsi.call_function('x'), 52) + self._test(''' + function f() { + var x = /* 1 + */ 2; + var y = /* 30 + * 40 */ 50; + return x + y; + } + ''', 52) - jsi = JSInterpreter(''' - function f() { - var x = "/*"; - var y = 1 /* comment */ + 2; - return y; - } - ''') - self.assertEqual(jsi.call_function('f'), 3) + self._test(''' + function f() { + var x = "/*"; + var y = 1 /* comment */ + 2; + return y; + } + ''', 3) def test_precedence(self): - jsi = 
JSInterpreter(''' - function x() { - var a = [10, 20, 30, 40, 50]; - var b = 6; - a[0]=a[b%a.length]; - return a; - }''') - self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + self._test(''' + function f() { + var a = [10, 20, 30, 40, 50]; + var b = 6; + a[0]=a[b%a.length]; + return a; + } + ''', [20, 20, 30, 40, 50]) def test_builtins(self): - jsi = JSInterpreter(''' - function x() { return NaN } - ''') - self.assertTrue(math.isnan(jsi.call_function('x'))) + jsi = JSInterpreter('function f() { return NaN }') + self.assertTrue(math.isnan(jsi.call_function('f'))) - jsi = JSInterpreter(''' - function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } - ''') - self.assertEqual(jsi.call_function('x'), 86000) - jsi = JSInterpreter(''' - function x(dt) { return new Date(dt) - 0; } - ''') - self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', + 86000) + self._test('function f(dt) { return new Date(dt) - 0; }', + 86000, args=['Wednesday 31 December 1969 18:01:26 MDT']) def test_call(self): jsi = JSInterpreter(''' - function x() { return 2; } - function y(a) { return x() + (a?a:0); } - function z() { return y(3); } + function x() { return 2; } + function y(a) { return x() + (a?a:0); } + function z() { return y(3); } ''') self.assertEqual(jsi.call_function('z'), 5) self.assertEqual(jsi.call_function('y'), 2) def test_if(self): - jsi = JSInterpreter(''' - function x() { - let a = 9; - if (0==0) {a++} - return a - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + let a = 9; + if (0==0) {a++} + return a + } + ''', 10) - jsi = JSInterpreter(''' - function x() { - if (0==0) {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0==0) {return 10} + } + ''', 10) - jsi = JSInterpreter(''' - function x() { - if (0!=0) {return 1} - else {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0!=0) {return 1} + else {return 10} + } + ''', 10) """ # Unsupported - jsi = JSInterpreter(''' - function x() { - if (0!=0) {return 1} - else if (1==0) {return 2} - else {return 10} - }''') - self.assertEqual(jsi.call_function('x'), 10) + self._test(''' + function f() { + if (0!=0) {return 1} + else if (1==0) {return 2} + else {return 10} + } + ''', 10) """ def test_for_loop(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) {a++} return a } - ''') - self.assertEqual(jsi.call_function('x'), 10) + self._test('function f() { a=0; for (i=0; i-10; i++) {a++} return a }', 10) def test_switch(self): jsi = JSInterpreter(''' - function x(f) { switch(f){ - case 1:f+=1; - case 2:f+=2; - case 3:f+=3;break; - case 4:f+=4; - default:f=0; - } return f } + function f(x) { switch(x){ + case 1:x+=1; + case 2:x+=2; + case 3:x+=3;break; + case 4:x+=4; + default:x=0; + } return x } ''') - self.assertEqual(jsi.call_function('x', 1), 7) - self.assertEqual(jsi.call_function('x', 3), 6) - self.assertEqual(jsi.call_function('x', 5), 0) + self.assertEqual(jsi.call_function('f', 1), 7) + self.assertEqual(jsi.call_function('f', 3), 6) + self.assertEqual(jsi.call_function('f', 5), 0) def test_switch_default(self): jsi = JSInterpreter(''' - function x(f) { switch(f){ - case 2: f+=2; - default: f-=1; - case 5: - case 6: f+=6; - case 0: break; - case 1: f+=1; - } return f } + function f(x) { switch(x){ + case 2: x+=2; + 
default: x-=1; + case 5: + case 6: x+=6; + case 0: break; + case 1: x+=1; + } return x } ''') - self.assertEqual(jsi.call_function('x', 1), 2) - self.assertEqual(jsi.call_function('x', 5), 11) - self.assertEqual(jsi.call_function('x', 9), 14) + self.assertEqual(jsi.call_function('f', 1), 2) + self.assertEqual(jsi.call_function('f', 5), 11) + self.assertEqual(jsi.call_function('f', 9), 14) def test_try(self): - jsi = JSInterpreter(''' - function x() { try{return 10} catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 10) + self._test('function f() { try{return 10} catch(e){return 5} }', 10) def test_catch(self): - jsi = JSInterpreter(''' - function x() { try{throw 10} catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test('function f() { try{throw 10} catch(e){return 5} }', 5) def test_finally(self): - jsi = JSInterpreter(''' - function x() { try{throw 10} finally {return 42} } - ''') - self.assertEqual(jsi.call_function('x'), 42) - jsi = JSInterpreter(''' - function x() { try{throw 10} catch(e){return 5} finally {return 42} } - ''') - self.assertEqual(jsi.call_function('x'), 42) + self._test('function f() { try{throw 10} finally {return 42} }', 42) + self._test('function f() { try{throw 10} catch(e){return 5} finally {return 42} }', 42) def test_nested_try(self): - jsi = JSInterpreter(''' - function x() {try { - try{throw 10} finally {throw 42} - } catch(e){return 5} } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test(''' + function f() {try { + try{throw 10} finally {throw 42} + } catch(e){return 5} } + ''', 5) def test_for_loop_continue(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } - ''') - self.assertEqual(jsi.call_function('x'), 0) + self._test('function f() { a=0; for (i=0; i-10; i++) { continue; a++ } return a }', 0) def test_for_loop_break(self): - jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { break; a++ } return a } - ''') - self.assertEqual(jsi.call_function('x'), 0) + self._test('function f() { a=0; for (i=0; i-10; i++) { break; a++ } return a }', 0) def test_for_loop_try(self): - jsi = JSInterpreter(''' - function x() { - for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; - return 42 } - ''') - self.assertEqual(jsi.call_function('x'), 42) + self._test(''' + function f() { + for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; + return 42 } + ''', 42) def test_literal_list(self): - jsi = JSInterpreter(''' - function x() { return [1, 2, "asdf", [5, 6, 7]][3] } - ''') - self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + self._test('function f() { return [1, 2, "asdf", [5, 6, 7]][3] }', [5, 6, 7]) def test_comma(self): - jsi = JSInterpreter(''' - function x() { a=5; a -= 1, a+=3; return a } - ''') - self.assertEqual(jsi.call_function('x'), 7) - - jsi = JSInterpreter(''' - function x() { a=5; return (a -= 1, a+=3, a); } - ''') - self.assertEqual(jsi.call_function('x'), 7) - - jsi = JSInterpreter(''' - function x() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) } - ''') - self.assertEqual(jsi.call_function('x'), 5) + self._test('function f() { a=5; a -= 1, a+=3; return a }', 7) + self._test('function f() { a=5; return (a -= 1, a+=3, a); }', 7) + self._test('function f() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) }', 5) def test_void(self): - jsi = JSInterpreter(''' - function x() { return void 42; } - ''') - 
self.assertEqual(jsi.call_function('x'), None) + self._test('function f() { return void 42; }', None) def test_return_function(self): jsi = JSInterpreter(''' - function x() { return [1, function(){return 1}][1] } + function f() { return [1, function(){return 1}][1] } ''') - self.assertEqual(jsi.call_function('x')([]), 1) + self.assertEqual(jsi.call_function('f')([]), 1) def test_null(self): - jsi = JSInterpreter(''' - function x() { return null; } - ''') - self.assertEqual(jsi.call_function('x'), None) - - jsi = JSInterpreter(''' - function x() { return [null > 0, null < 0, null == 0, null === 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, False, False]) - - jsi = JSInterpreter(''' - function x() { return [null >= 0, null <= 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [True, True]) + self._test('function f() { return null; }', None) + self._test('function f() { return [null > 0, null < 0, null == 0, null === 0]; }', + [False, False, False, False]) + self._test('function f() { return [null >= 0, null <= 0]; }', [True, True]) def test_undefined(self): - jsi = JSInterpreter(''' - function x() { return undefined === undefined; } - ''') - self.assertEqual(jsi.call_function('x'), True) + self._test('function f() { return undefined === undefined; }', True) + self._test('function f() { return undefined; }', JS_Undefined) + self._test('function f() {return undefined ?? 42; }', 42) + self._test('function f() { let v; return v; }', JS_Undefined) + self._test('function f() { let v; return v**0; }', 1) + self._test('function f() { let v; return [v>42, v<=42, v&&42, 42&&v]; }', + [False, False, JS_Undefined, JS_Undefined]) + + self._test(''' + function f() { return [ + undefined === undefined, + undefined == undefined, + undefined == null, + undefined < undefined, + undefined > undefined, + undefined === 0, + undefined == 0, + undefined < 0, + undefined > 0, + undefined >= 0, + undefined <= 0, + undefined > null, + undefined < null, + undefined === null + ]; } + ''', list(map(bool, (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)))) jsi = JSInterpreter(''' - function x() { return undefined; } + function f() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { let v; return v; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } - ''') - self.assertEqual(jsi.call_function('x'), [True, True, False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined === 0, undefined == 0, undefined < 0, undefined > 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined >= 0, undefined <= 0]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined > null, undefined < null, undefined == null, undefined === null]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, True, False]) - - jsi = JSInterpreter(''' - function x() { return [undefined === null, undefined == null, undefined < null, undefined > null]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, True, False, False]) - - jsi = JSInterpreter(''' - function x() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } - ''') - for y in jsi.call_function('x'): + for y in 
jsi.call_function('f'): self.assertTrue(math.isnan(y)) - jsi = JSInterpreter(''' - function x() { let v; return v**0; } - ''') - self.assertEqual(jsi.call_function('x'), 1) - - jsi = JSInterpreter(''' - function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } - ''') - self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) - - jsi = JSInterpreter('function x(){return undefined ?? 42; }') - self.assertEqual(jsi.call_function('x'), 42) - def test_object(self): - jsi = JSInterpreter(''' - function x() { return {}; } - ''') - self.assertEqual(jsi.call_function('x'), {}) - - jsi = JSInterpreter(''' - function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } - ''') - self.assertEqual(jsi.call_function('x'), [42, 0]) - - jsi = JSInterpreter(''' - function x() { let a; return a?.qq; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) - - jsi = JSInterpreter(''' - function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } - ''') - self.assertEqual(jsi.call_function('x'), JS_Undefined) + self._test('function f() { return {}; }', {}) + self._test('function f() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; }', [42, 0]) + self._test('function f() { let a; return a?.qq; }', JS_Undefined) + self._test('function f() { let a = {m1: 42, m2: 0 }; return a?.qq; }', JS_Undefined) def test_regex(self): - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; } - ''') - self.assertEqual(jsi.call_function('x'), None) + self._test('function f() { let a=/,,[/,913,/](,)}/; }', None) - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/; return a; } - ''') - self.assertIsInstance(jsi.call_function('x'), re.Pattern) + jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/; return a; }') + self.assertIsInstance(jsi.call_function('f'), re.Pattern) - jsi = JSInterpreter(''' - function x() { let a=/,,[/,913,/](,)}/i; return a; } - ''') - self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/i; return a; }') + self.assertEqual(jsi.call_function('f').flags & re.I, re.I) - jsi = JSInterpreter(R''' - function x() { let a=/,][}",],()}(\[)/; return a; } - ''') - self.assertEqual(jsi.call_function('x').pattern, r',][}",],()}(\[)') + jsi = JSInterpreter(R'function f() { let a=/,][}",],()}(\[)/; return a; }') + self.assertEqual(jsi.call_function('f').pattern, r',][}",],()}(\[)') - jsi = JSInterpreter(R''' - function x() { let a=[/[)\\]/]; return a[0]; } - ''') - self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') + self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') def test_char_code_at(self): - jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') - self.assertEqual(jsi.call_function('x', 0), 116) - self.assertEqual(jsi.call_function('x', 1), 101) - self.assertEqual(jsi.call_function('x', 2), 115) - self.assertEqual(jsi.call_function('x', 3), 116) - self.assertEqual(jsi.call_function('x', 4), None) - self.assertEqual(jsi.call_function('x', 'not_a_number'), 116) + jsi = JSInterpreter('function f(i){return "test".charCodeAt(i)}') + self.assertEqual(jsi.call_function('f', 0), 116) + self.assertEqual(jsi.call_function('f', 1), 101) + self.assertEqual(jsi.call_function('f', 2), 115) + self.assertEqual(jsi.call_function('f', 3), 116) + self.assertEqual(jsi.call_function('f', 4), None) + self.assertEqual(jsi.call_function('f', 'not_a_number'), 116) def 
test_bitwise_operators_overflow(self): - jsi = JSInterpreter('function x(){return -524999584 << 5}') - self.assertEqual(jsi.call_function('x'), 379882496) - - jsi = JSInterpreter('function x(){return 1236566549 << 5}') - self.assertEqual(jsi.call_function('x'), 915423904) + self._test('function f(){return -524999584 << 5}', 379882496) + self._test('function f(){return 1236566549 << 5}', 915423904) def test_bitwise_operators_typecast(self): - jsi = JSInterpreter('function x(){return null << 5}') - self.assertEqual(jsi.call_function('x'), 0) - - jsi = JSInterpreter('function x(){return undefined >> 5}') - self.assertEqual(jsi.call_function('x'), 0) - - jsi = JSInterpreter('function x(){return 42 << NaN}') - self.assertEqual(jsi.call_function('x'), 42) + self._test('function f(){return null << 5}', 0) + self._test('function f(){return undefined >> 5}', 0) + self._test('function f(){return 42 << NaN}', 42) def test_negative(self): - jsi = JSInterpreter("function f(){return 2 * -2.0;}") - self.assertEqual(jsi.call_function('f'), -4) - - jsi = JSInterpreter('function f(){return 2 - - -2;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return 2 - - - -2;}') - self.assertEqual(jsi.call_function('f'), 4) - - jsi = JSInterpreter('function f(){return 2 - + + - -2;}') - self.assertEqual(jsi.call_function('f'), 0) - - jsi = JSInterpreter('function f(){return 2 + - + - -2;}') - self.assertEqual(jsi.call_function('f'), 0) + self._test('function f(){return 2 * -2.0 ;}', -4) + self._test('function f(){return 2 - - -2 ;}', 0) + self._test('function f(){return 2 - - - -2 ;}', 4) + self._test('function f(){return 2 - + + - -2;}', 0) + self._test('function f(){return 2 + - + - -2;}', 0) if __name__ == '__main__': diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8f52a71a95..91aec1fe6e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -13,6 +13,7 @@ import random import re import shutil +import string import subprocess import sys import tempfile @@ -21,7 +22,6 @@ import traceback import unicodedata import urllib.request -from string import Formatter, ascii_letters from .cache import Cache from .compat import compat_os_name, compat_shlex_quote @@ -1079,7 +1079,7 @@ def _outtmpl_expandpath(outtmpl): # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. - sep = ''.join(random.choices(ascii_letters, k=32)) + sep = ''.join(random.choices(string.ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution @@ -1238,7 +1238,7 @@ def _dumpjson_default(obj): return list(obj) return repr(obj) - class _ReplacementFormatter(Formatter): + class _ReplacementFormatter(string.Formatter): def get_field(self, field_name, args, kwargs): if field_name.isdigit(): return args[0], -1 @@ -2068,86 +2068,86 @@ def syntax_error(note, start): def _parse_filter(tokens): filter_parts = [] - for type, string, start, _, _ in tokens: - if type == tokenize.OP and string == ']': + for type, string_, start, _, _ in tokens: + if type == tokenize.OP and string_ == ']': return ''.join(filter_parts) else: - filter_parts.append(string) + filter_parts.append(string_) def _remove_unused_ops(tokens): # Remove operators that we don't use and join them with the surrounding strings. # E.g. 
'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': + for type, string_, start, end, line in tokens: + if type == tokenize.OP and string_ == '[': if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': + for type, string_, start, end, line in tokens: + yield type, string_, start, end, line + if type == tokenize.OP and string_ == ']': break - elif type == tokenize.OP and string in ALLOWED_OPS: + elif type == tokenize.OP and string_ in ALLOWED_OPS: if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None - yield type, string, start, end, line + yield type, string_, start, end, line elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: if not last_string: - last_string = string + last_string = string_ last_start = start last_end = end else: - last_string += string + last_string += string_ if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None - for type, string, start, _, _ in tokens: + for type, string_, start, _, _ in tokens: # ENCODING is only defined in python 3.x if type == getattr(tokenize, 'ENCODING', None): continue elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) + current_selector = FormatSelector(SINGLE, string_, []) elif type == tokenize.OP: - if string == ')': + if string_ == ')': if not inside_group: # ')' will be handled by the parentheses group tokens.restore_last_token() break - elif inside_merge and string in ['/', ',']: + elif inside_merge and string_ in ['/', ',']: tokens.restore_last_token() break - elif inside_choice and string == ',': + elif inside_choice and string_ == ',': tokens.restore_last_token() break - elif string == ',': + elif string_ == ',': if not current_selector: raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None - elif string == '/': + elif string_ == '/': if not current_selector: raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': + elif string_ == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) - elif string == '(': + elif string_ == '(': if current_selector: raise syntax_error('Unexpected "("', start) group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) - elif string == '+': + elif string_ == '+': if not current_selector: raise syntax_error('Unexpected "+"', start) selector_1 = current_selector @@ -2156,7 +2156,7 @@ def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, ins raise 
syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise syntax_error(f'Operator not recognized: "{string}"', start) + raise syntax_error(f'Operator not recognized: "{string_}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2898,7 +2898,7 @@ def format_tmpl(tmpl): fmt = '%({})s' if tmpl.startswith('{'): - tmpl = f'.{tmpl}' + tmpl, fmt = f'.{tmpl}', '%({})j' if tmpl.endswith('='): tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(','))) @@ -2937,7 +2937,8 @@ def print_field(field, actual_field=None, optional=False): print_field('url', 'urls') print_field('thumbnail', optional=True) print_field('description', optional=True) - print_field('filename', optional=True) + if filename: + print_field('filename') if self.params.get('forceduration') and info_copy.get('duration') is not None: self.to_stdout(formatSeconds(info_copy['duration'])) print_field('format') @@ -3419,8 +3420,8 @@ def sanitize_info(info_dict, remove_private_keys=False): if remove_private_keys: reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', - 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', - '_format_sort_fields', + 'entries', 'filepath', '_filename', 'filename', 'infojson_filename', 'original_url', + 'playlist_autonumber', '_format_sort_fields', } else: reject = lambda k, v: False @@ -3489,7 +3490,7 @@ def run_pp(self, pp, infodict): *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict - def run_all_pps(self, key, info, *, additional_pps=None, fatal=True): + def run_all_pps(self, key, info, *, additional_pps=None): if key != 'video': self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 965b1c0f29..82974fb27b 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -248,7 +248,7 @@ def _separate(expr, delim=',', max_split=None): return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 - in_quote, escaping, after_op, in_regex_char_group, in_unary_op = None, False, True, False, False + in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 47aa75c470..190af1b7d7 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3281,7 +3281,7 @@ def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO def variadic(x, allowed_types=NO_DEFAULT): - return x if is_iterable_like(x, blocked_types=allowed_types) else (x,) + return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) def dict_get(d, key_or_keys, default=None, skip_false_values=True): @@ -5404,7 +5404,7 @@ def to_high_limit_path(path): def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traverse_obj(obj, *variadic(field)) - if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore): + if not val if ignore is NO_DEFAULT else val in variadic(ignore): return default return template % func(val) @@ -5704,8 +5704,8 @@ def traverse_dict(dictn, keys, casesense=True): return traverse_obj(dictn, 
keys, casesense=casesense, is_user_input=True, traverse_string=True) -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) +def get_first(obj, *paths, **kwargs): + return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) def time_seconds(**kwargs): From 447afb9eaa65bc677e3245c83e53a8e69c174a3c Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 20 May 2023 19:11:03 +1200 Subject: [PATCH 089/501] [extractor/youtube] Support podcasts and releases tabs Closes https://github.com/yt-dlp/yt-dlp/issues/6893 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 48 ++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2b17751e5e..d089822f64 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4639,11 +4639,19 @@ def _playlist_entries(self, video_list_renderer): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( - rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} + rich_grid_renderer, + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} video_id = renderer.get('videoId') - if not video_id: + if video_id: + yield self._extract_video(renderer) + return + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=self._get_text(renderer, 'title')) return - yield self._extract_video(renderer) def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -6185,6 +6193,40 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': '3Blue1Brown', }, 'playlist_count': 0, + }, { + # Podcasts tab, with rich entry playlistRenderers + 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts', + 'info_dict': { + 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast', + 'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c', + 'title': '99 Percent Invisible - Podcasts', + 'uploader': '99 Percent Invisible', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'tags': [], + 'channel': '99 Percent Invisible', + 'uploader_id': '@99percentinvisiblepodcast', + }, + 'playlist_count': 1, + }, { + # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + 'url': 'https://www.youtube.com/@AHimitsu/releases', + 'info_dict': { + 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'channel': 'A Himitsu', + 'uploader_url': 'https://www.youtube.com/@AHimitsu', + 'title': 'A Himitsu - Releases', + 'uploader_id': '@AHimitsu', + 'uploader': 'A Himitsu', + 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'tags': 'count:16', + 'description': 'I make music', + 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', + 'channel_follower_count': int, + }, + 'playlist_mincount': 10, }] @classmethod From d2e84d5eb01c66fc5304e8566348d65a7be24ed7 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Mon, 3 Apr 2023 07:01:03 +0200 Subject: [PATCH 090/501] [update] Better error handling Authored by: pukkandan --- yt_dlp/__init__.py | 21 +++++++++++++-------- yt_dlp/update.py | 7 ++++--- 2 files changed, 17 insertions(+), 
11 deletions(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 47ee3cc02f..8806106d31 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -13,6 +13,7 @@ import os import re import sys +import traceback from .compat import compat_shlex_quote from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS @@ -937,14 +938,18 @@ def _real_main(argv=None): if opts.rm_cachedir: ydl.cache.remove() - updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None) - if opts.update_self and updater.update() and actual_use: - if updater.cmd: - return updater.restart() - # This code is reachable only for zip variant in py < 3.10 - # It makes sense to exit here, but the old behavior is to continue - ydl.report_warning('Restart yt-dlp to use the updated version') - # return 100, 'ERROR: The program must exit for the update to complete' + try: + updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart yt-dlp to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + except Exception: + traceback.print_exc() + ydl._download_retcode = 100 if not actual_use: if pre_process: diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 5a752d7167..7914de832f 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -16,6 +16,7 @@ Popen, cached_method, deprecation_warning, + network_exceptions, remove_end, remove_start, sanitized_Request, @@ -258,8 +259,8 @@ def check_update(self): self.ydl.to_screen(( f'Available version: {self._label(self.target_channel, self.latest_version)}, ' if self.target_tag == 'latest' else '' ) + f'Current version: {self._label(CHANNEL, self.current_version)}') - except Exception: - return self._report_network_error('obtain version info', delim='; Please try again later or') + except network_exceptions as e: + return self._report_network_error(f'obtain version info ({e})', delim='; Please try again later or') if not is_non_updateable(): self.ydl.to_screen(f'Current Build Hash: {_sha256_file(self.filename)}') @@ -303,7 +304,7 @@ def update(self): try: newcontent = self._download(self.release_name, self._tag) - except Exception as e: + except network_exceptions as e: if isinstance(e, urllib.error.HTTPError) and e.code == 404: return self._report_error( f'The requested tag {self._label(self.target_channel, self.target_tag)} does not exist', True) From 665472a7de3880578c0b7b3f95c71570c056368e Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sat, 20 May 2023 21:21:32 +0200 Subject: [PATCH 091/501] [update] Implement `--update-to` repo Authored by: Grub4K, pukkandan --- README.md | 13 ++++++++----- yt_dlp/__init__.py | 2 +- yt_dlp/options.py | 8 ++++---- yt_dlp/update.py | 41 ++++++++++++++++++++++++++++------------- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 6dff57b4c5..d0eaba7477 100644 --- a/README.md +++ b/README.md @@ -196,12 +196,15 @@ ## UPDATE The `nightly` channel has releases built after each push to the master branch, and will have the most recent fixes and additions, but also have more risk of regressions. They are available in [their own repo](https://github.com/yt-dlp/yt-dlp-nightly-builds/releases). 
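The error handling introduced in the update patch above boils down to the following pattern; this is a minimal sketch with a stand-in `network_exceptions` tuple and a hypothetical `run_update`, not the actual yt-dlp code:

import traceback

network_exceptions = (OSError, TimeoutError)  # stand-in; yt-dlp defines its own tuple

def run_update():
    raise TimeoutError('simulated network failure')  # hypothetical updater call

retcode = 0
try:
    run_update()
except network_exceptions as e:
    print(f'ERROR: Unable to obtain version info ({e}); please try again later')
    retcode = 100
except Exception:
    traceback.print_exc()  # unexpected errors are reported but must not crash the CLI
    retcode = 100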
When using `--update`/`-U`, a release binary will only update to its current channel.
-This release channel can be changed by using the `--update-to` option. `--update-to` can also be used to upgrade or downgrade to specific tags from a channel.
+`--update-to CHANNEL` can be used to switch to a different channel when a newer version is available. `--update-to [CHANNEL@]TAG` can also be used to upgrade or downgrade to specific tags from a channel.
+
+You may also use `--update-to <repository>` (`<owner>/<repository>`) to update to a channel on a completely different repository. Be careful with which repository you are updating to, though; there is no verification done for binaries from different repositories.
 
 Example usage:
 * `yt-dlp --update-to nightly` change to `nightly` channel and update to its latest release
 * `yt-dlp --update-to stable@2023.02.17` upgrade/downgrade to release to `stable` channel tag `2023.02.17`
 * `yt-dlp --update-to 2023.01.06` upgrade/downgrade to tag `2023.01.06` if it exists on the current channel
+* `yt-dlp --update-to example/yt-dlp@2023.03.01` upgrade/downgrade to the release from the `example/yt-dlp` repository, tag `2023.03.01`
 
 <!-- MANPAGE: BEGIN EXCLUDED SECTION -->
 ## RELEASE FILES
@@ -360,10 +363,10 @@ ## General Options:
     -U, --update                    Update this program to the latest version
         --no-update                 Do not check for updates (default)
         --update-to [CHANNEL]@[TAG] Upgrade/downgrade to a specific version.
-                                    CHANNEL and TAG defaults to "stable" and
-                                    "latest" respectively if omitted; See
-                                    "UPDATE" for details. Supported channels:
-                                    stable, nightly
+                                    CHANNEL can be a repository as well. CHANNEL
+                                    and TAG default to "stable" and "latest"
+                                    respectively if omitted; See "UPDATE" for
+                                    details. Supported channels: stable, nightly
     -i, --ignore-errors             Ignore download and postprocessing errors.
                                     The download will be considered successful
                                     even if the postprocessing fails
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 8806106d31..9563d784aa 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -939,7 +939,7 @@ def _real_main(argv=None):
             ydl.cache.remove()
 
         try:
-            updater = Updater(ydl, opts.update_self if isinstance(opts.update_self, str) else None)
+            updater = Updater(ydl, opts.update_self)
             if opts.update_self and updater.update() and actual_use:
                 if updater.cmd:
                     return updater.restart()
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index dc46ce9984..838d79fcb1 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -323,7 +323,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         help='Print program version and exit')
     general.add_option(
         '-U', '--update',
-        action='store_true', dest='update_self',
+        action='store_const', dest='update_self', const=CHANNEL,
         help=format_field(
             is_non_updateable(), None, 'Check if updates are available. %s',
             default=f'Update this program to the latest {CHANNEL} version'))
@@ -335,9 +335,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         '--update-to', action='store', dest='update_self', metavar='[CHANNEL]@[TAG]',
         help=(
-            'Upgrade/downgrade to a specific version. CHANNEL and TAG defaults to '
-            f'"{CHANNEL}" and "latest" respectively if omitted; See "UPDATE" for details. '
-            f'Supported channels: {", ".join(UPDATE_SOURCES)}'))
+            'Upgrade/downgrade to a specific version. CHANNEL can be a repository as well. '
+            f'CHANNEL and TAG default to "{CHANNEL.partition("@")[0]}" and "latest" respectively if omitted; '
+            f'See "UPDATE" for details. 
Supported channels: {", ".join(UPDATE_SOURCES)}')) general.add_option( '-i', '--ignore-errors', action='store_true', dest='ignoreerrors', diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 7914de832f..6c9bdaf1c7 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -129,27 +129,36 @@ def __init__(self, ydl, target=None): self.ydl = ydl self.target_channel, sep, self.target_tag = (target or CHANNEL).rpartition('@') - if not sep and self.target_tag in UPDATE_SOURCES: # stable => stable@latest - self.target_channel, self.target_tag = self.target_tag, None + # stable => stable@latest + if not sep and ('/' in self.target_tag or self.target_tag in UPDATE_SOURCES): + self.target_channel = self.target_tag + self.target_tag = None elif not self.target_channel: - self.target_channel = CHANNEL + self.target_channel = CHANNEL.partition('@')[0] if not self.target_tag: - self.target_tag, self._exact = 'latest', False + self.target_tag = 'latest' + self._exact = False elif self.target_tag != 'latest': self.target_tag = f'tags/{self.target_tag}' - @property - def _target_repo(self): - try: - return UPDATE_SOURCES[self.target_channel] - except KeyError: - return self._report_error( - f'Invalid update channel {self.target_channel!r} requested. ' - f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) + if '/' in self.target_channel: + self._target_repo = self.target_channel + if self.target_channel not in (CHANNEL, *UPDATE_SOURCES.values()): + self.ydl.report_warning( + f'You are switching to an {self.ydl._format_err("unofficial", "red")} executable ' + f'from {self.ydl._format_err(self._target_repo, self.ydl.Styles.EMPHASIS)}. ' + f'Run {self.ydl._format_err("at your own risk", "light red")}') + self.restart = self._blocked_restart + else: + self._target_repo = UPDATE_SOURCES.get(self.target_channel) + if not self._target_repo: + self._report_error( + f'Invalid update channel {self.target_channel!r} requested. ' + f'Valid channels are {", ".join(UPDATE_SOURCES)}', True) def _version_compare(self, a, b, channel=CHANNEL): - if channel != self.target_channel: + if self._exact and channel != self.target_channel: return False if _VERSION_RE.fullmatch(f'{a}.{b}'): @@ -372,6 +381,12 @@ def restart(self): _, _, returncode = Popen.run(self.cmd) return returncode + def _blocked_restart(self): + self._report_error( + 'Automatically restarting into custom builds is disabled for security reasons. 
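The `CHANNEL@TAG` parsing in `Updater.__init__` above can be condensed into a standalone sketch. This is the same `rpartition`-based logic, simplified; the `stable`/`nightly` mapping mirrors `UPDATE_SOURCES` and is only illustrative:

UPDATE_SOURCES = {'stable': 'yt-dlp/yt-dlp', 'nightly': 'yt-dlp/yt-dlp-nightly-builds'}

def parse_target(target, default_channel='stable'):
    channel, sep, tag = target.rpartition('@')
    if not sep and ('/' in tag or tag in UPDATE_SOURCES):
        channel, tag = tag, None      # bare channel or owner/repo, e.g. "nightly"
    elif not channel:
        channel = default_channel     # bare tag, e.g. "2023.01.06"
    return channel, tag or 'latest'

assert parse_target('nightly') == ('nightly', 'latest')
assert parse_target('stable@2023.02.17') == ('stable', '2023.02.17')
assert parse_target('example/yt-dlp@2023.03.01') == ('example/yt-dlp', '2023.03.01')
assert parse_target('2023.01.06') == ('stable', '2023.01.06')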
' + 'Restart yt-dlp to use the updated version', expected=True) + return self.ydl._download_retcode + def run_update(ydl): """Update the program file with the latest version from the repository From 44a79958f0b596ee71e1eb25f158610aada29d1b Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Mon, 3 Apr 2023 07:06:27 +0200 Subject: [PATCH 092/501] [build] Fix macOS target Authored by: Grub4K --- .github/workflows/build.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa11c61941..bec0576d1e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -188,21 +188,23 @@ jobs: steps: - uses: actions/checkout@v3 - # NB: In order to create a universal2 application, the version of python3 in /usr/bin has to be used + # NB: Building universal2 does not work with python from actions/setup-python - name: Install Requirements run: | brew install coreutils - /usr/bin/python3 -m pip install -U --user pip Pyinstaller==5.8 -r requirements.txt + python3 -m pip install -U --user pip setuptools wheel + # We need to ignore wheels otherwise we break universal2 builds + python3 -m pip install -U --user --no-binary :all: Pyinstaller -r requirements.txt - name: Prepare run: | - /usr/bin/python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} - /usr/bin/python3 devscripts/make_lazy_extractors.py + python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python3 devscripts/make_lazy_extractors.py - name: Build run: | - /usr/bin/python3 pyinst.py --target-architecture universal2 --onedir + python3 pyinst.py --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) - /usr/bin/python3 pyinst.py --target-architecture universal2 + python3 pyinst.py --target-architecture universal2 - name: Upload artifacts uses: actions/upload-artifact@v3 @@ -232,7 +234,8 @@ jobs: - name: Install Requirements run: | brew install coreutils - python3 -m pip install -U --user pip Pyinstaller -r requirements.txt + python3 -m pip install -U --user pip setuptools wheel + python3 -m pip install -U --user Pyinstaller -r requirements.txt - name: Prepare run: | From c4efa0aefec8daef1de62fd1693f13edf3c8b03c Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 20 May 2023 11:08:50 -0500 Subject: [PATCH 093/501] [build] Various build workflow improvements - Wait for build before publishing to PyPI - Do not run `meta_files` job if release is cancelled - Customizable channel in release workflow - Display badges above changelog Authored by: bashonly, Grub4K --- .github/workflows/build.yml | 4 +- .github/workflows/publish.yml | 46 +++++++++++------ .github/workflows/release-nightly.yml | 3 +- .github/workflows/release.yml | 72 ++++++++++++++++++++------- devscripts/update-version.py | 2 +- 5 files changed, 90 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bec0576d1e..d038e693d9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,7 +41,7 @@ on: required: true type: string channel: - description: Update channel (stable/nightly) + description: Update channel (stable/nightly/...) 
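Note how the `update.py` change above swaps the bound method (`self.restart = self._blocked_restart`) instead of branching inside `restart()` on every call. In miniature, with hypothetical return values standing in for the real behaviour:

class Updater:
    def __init__(self, official):
        if not official:
            self.restart = self._blocked_restart  # instance attribute shadows the method

    def restart(self):
        return 'restarting'

    def _blocked_restart(self):
        return 'restart blocked: custom build'

assert Updater(official=True).restart() == 'restarting'
assert Updater(official=False).restart() == 'restart blocked: custom build'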
required: true default: stable type: string @@ -316,7 +316,7 @@ jobs: dist/yt-dlp_x86.exe meta_files: - if: inputs.meta_files && always() + if: inputs.meta_files && always() && !cancelled() needs: - unix - linux_arm diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8a1bd9a010..3ca5c69924 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,16 +2,20 @@ name: Publish on: workflow_call: inputs: - nightly: - default: false - required: false - type: boolean + channel: + default: stable + required: true + type: string version: required: true type: string target_commitish: required: true type: string + prerelease: + default: false + required: true + type: boolean secrets: ARCHIVE_REPO_TOKEN: required: false @@ -34,6 +38,19 @@ jobs: - name: Generate release notes run: | + printf '%s' \ + '[![Installation](https://img.shields.io/badge/-Which%20file%20should%20I%20download%3F-white.svg?style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp#installation "Installation instructions") ' \ + '[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]' \ + '(https://github.com/yt-dlp/yt-dlp/tree/2023.03.04#readme "Documentation") ' \ + '[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators "Donate") ' \ + '[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]' \ + '(https://discord.gg/H5MNcFW63r "Discord") ' \ + ${{ inputs.channel != 'nightly' && '"[![Nightly](https://img.shields.io/badge/Get%20nightly%20builds-purple.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-nightly-builds/releases/latest \"Nightly builds\")"' || '' }} \ + > ./RELEASE_NOTES + printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF #### A description of the various files are in the [README](https://github.com/yt-dlp/yt-dlp#release-files) --- @@ -41,9 +58,9 @@ jobs: $(python ./devscripts/make_changelog.py -vv) </details> EOF - echo "**This is an automated nightly pre-release build**" >> ./PRERELEASE_NOTES - cat ./RELEASE_NOTES >> ./PRERELEASE_NOTES - echo "Generated from: https://github.com/${{ github.repository }}/commit/${{ inputs.target_commitish }}" >> ./ARCHIVE_NOTES + printf '%s\n\n' '**This is an automated nightly pre-release build**' >> ./NIGHTLY_NOTES + cat ./RELEASE_NOTES >> ./NIGHTLY_NOTES + printf '%s\n\n' 'Generated from: https://github.com/${{ github.repository }}/commit/${{ inputs.target_commitish }}' >> ./ARCHIVE_NOTES cat ./RELEASE_NOTES >> ./ARCHIVE_NOTES - name: Archive nightly release @@ -51,7 +68,7 @@ jobs: GH_TOKEN: ${{ secrets.ARCHIVE_REPO_TOKEN }} GH_REPO: ${{ vars.ARCHIVE_REPO }} if: | - inputs.nightly && env.GH_TOKEN != '' && env.GH_REPO != '' + inputs.channel == 'nightly' && env.GH_TOKEN != '' && env.GH_REPO != '' run: | gh release create \ --notes-file ARCHIVE_NOTES \ @@ -60,7 +77,7 @@ jobs: artifact/* - name: Prune old nightly release - if: inputs.nightly && !vars.ARCHIVE_REPO + if: inputs.channel == 'nightly' && !vars.ARCHIVE_REPO env: GH_TOKEN: ${{ github.token }} run: | @@ -68,14 +85,15 @@ jobs: git tag --delete "nightly" || true sleep 5 # Enough time to cover deletion race condition - - name: Publish release${{ inputs.nightly && ' (nightly)' || '' }} + - name: Publish release${{ inputs.channel == 'nightly' && ' (nightly)' || 
'' }} env: GH_TOKEN: ${{ github.token }} - if: (inputs.nightly && !vars.ARCHIVE_REPO) || !inputs.nightly + if: (inputs.channel == 'nightly' && !vars.ARCHIVE_REPO) || inputs.channel != 'nightly' run: | gh release create \ - --notes-file ${{ inputs.nightly && 'PRE' || '' }}RELEASE_NOTES \ + --notes-file ${{ inputs.channel == 'nightly' && 'NIGHTLY_NOTES' || 'RELEASE_NOTES' }} \ --target ${{ inputs.target_commitish }} \ - --title "yt-dlp ${{ inputs.nightly && 'nightly ' || '' }}${{ inputs.version }}" \ - ${{ inputs.nightly && '--prerelease "nightly"' || inputs.version }} \ + --title "yt-dlp ${{ inputs.channel == 'nightly' && 'nightly ' || '' }}${{ inputs.version }}" \ + ${{ inputs.prerelease && '--prerelease' || '' }} \ + ${{ inputs.channel == 'nightly' && '"nightly"' || inputs.version }} \ artifact/* diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index d4f01ab649..543e2e6f78 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -46,6 +46,7 @@ jobs: permissions: contents: write with: - nightly: true + channel: nightly + prerelease: true version: ${{ needs.prepare.outputs.version }} target_commitish: ${{ github.sha }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e07fc0c077..ada508be82 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,22 @@ name: Release -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + version: + description: Version tag (YYYY.MM.DD[.REV]) + required: false + default: '' + type: string + channel: + description: Update channel (stable/nightly/...) + required: false + default: '' + type: string + prerelease: + description: Pre-release + default: false + type: boolean + permissions: contents: read @@ -9,8 +26,9 @@ jobs: contents: write runs-on: ubuntu-latest outputs: + channel: ${{ steps.set_channel.outputs.channel }} version: ${{ steps.update_version.outputs.version }} - head_sha: ${{ steps.push_release.outputs.head_sha }} + head_sha: ${{ steps.get_target.outputs.head_sha }} steps: - uses: actions/checkout@v3 @@ -21,10 +39,18 @@ jobs: with: python-version: "3.10" + - name: Set channel + id: set_channel + run: | + CHANNEL="${{ github.repository == 'yt-dlp/yt-dlp' && 'stable' || github.repository }}" + echo "channel=${{ inputs.channel || '$CHANNEL' }}" > "$GITHUB_OUTPUT" + - name: Update version id: update_version run: | - python devscripts/update-version.py ${{ vars.PUSH_VERSION_COMMIT == '' && '"$(date -u +"%H%M%S")"' || '' }} | \ + REVISION="${{ vars.PUSH_VERSION_COMMIT == '' && '$(date -u +"%H%M%S")' || '' }}" + REVISION="${{ inputs.prerelease && '$(date -u +"%H%M%S")' || '$REVISION' }}" + python devscripts/update-version.py ${{ inputs.version || '$REVISION' }} | \ grep -Po "version=\d+\.\d+\.\d+(\.\d+)?" 
>> "$GITHUB_OUTPUT" - name: Update documentation @@ -39,6 +65,7 @@ jobs: - name: Push to release id: push_release + if: ${{ !inputs.prerelease }} run: | git config --global user.name github-actions git config --global user.email github-actions@example.com @@ -46,14 +73,30 @@ jobs: git commit -m "Release ${{ steps.update_version.outputs.version }}" \ -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" git push origin --force ${{ github.event.ref }}:release + + - name: Get target commitish + id: get_target + run: | echo "head_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - name: Update master - if: vars.PUSH_VERSION_COMMIT != '' + if: vars.PUSH_VERSION_COMMIT != '' && !inputs.prerelease run: git push origin ${{ github.event.ref }} - publish_pypi_homebrew: + build: needs: prepare + uses: ./.github/workflows/build.yml + with: + version: ${{ needs.prepare.outputs.version }} + channel: ${{ needs.prepare.outputs.channel }} + permissions: + contents: read + packages: write # For package cache + secrets: + GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} + + publish_pypi_homebrew: + needs: [prepare, build] runs-on: ubuntu-latest steps: @@ -77,7 +120,7 @@ jobs: env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - if: env.TWINE_PASSWORD != '' + if: env.TWINE_PASSWORD != '' && !inputs.prerelease run: | rm -rf dist/* make pypi-files @@ -89,7 +132,7 @@ jobs: env: BREW_TOKEN: ${{ secrets.BREW_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' + if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease uses: actions/checkout@v3 with: repository: yt-dlp/homebrew-taps @@ -100,7 +143,7 @@ jobs: env: BREW_TOKEN: ${{ secrets.BREW_TOKEN }} PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' + if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease run: | python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.prepare.outputs.version }}" git -C taps/ config user.name github-actions @@ -108,22 +151,13 @@ jobs: git -C taps/ commit -am 'yt-dlp: ${{ needs.prepare.outputs.version }}' git -C taps/ push - build: - needs: prepare - uses: ./.github/workflows/build.yml - with: - version: ${{ needs.prepare.outputs.version }} - permissions: - contents: read - packages: write # For package cache - secrets: - GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} - publish: needs: [prepare, build] uses: ./.github/workflows/publish.yml permissions: contents: write with: + channel: ${{ needs.prepare.outputs.channel }} + prerelease: ${{ inputs.prerelease }} version: ${{ needs.prepare.outputs.version }} target_commitish: ${{ needs.prepare.outputs.head_sha }} diff --git a/devscripts/update-version.py b/devscripts/update-version.py index d888be8814..c873d10a5d 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -51,7 +51,7 @@ def get_git_head(): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Update the version.py file') parser.add_argument( - '-c', '--channel', choices=['stable', 'nightly'], default='stable', + '-c', '--channel', default='stable', help='Select update channel (default: %(default)s)') parser.add_argument( '-o', '--output', default='yt_dlp/version.py', From b73193c99aa23b135732408a5fcf655c68d731c6 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 20 May 2023 11:12:18 -0500 Subject: [PATCH 094/501] [build] Implement build verification using `--update-to` Authored by: 
bashonly, Grub4K --- .github/workflows/build.yml | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d038e693d9..ac0cfdf7cb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -127,6 +127,19 @@ jobs: mv ./dist/yt-dlp_linux ./yt-dlp_linux mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + binaries=("yt-dlp" "yt-dlp_linux") + for binary in "${binaries[@]}"; do + chmod +x ./${binary} + cp ./${binary} ./${binary}_downgraded + version="$(./${binary} --version)" + ./${binary}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./${binary}_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + done + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -176,6 +189,16 @@ jobs: python3.8 devscripts/make_lazy_extractors.py python3.8 pyinst.py + if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then + arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}" + chmod +x ./dist/yt-dlp_linux_${arch} + cp ./dist/yt-dlp_linux_${arch} ./dist/yt-dlp_linux_${arch}_downgraded + version="$(./dist/yt-dlp_linux_${arch} --version)" + ./dist/yt-dlp_linux_${arch}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_linux_${arch}_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + fi + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -206,6 +229,16 @@ jobs: (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) python3 pyinst.py --target-architecture universal2 + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos + cp ./dist/yt-dlp_macos ./dist/yt-dlp_macos_downgraded + version="$(./dist/yt-dlp_macos --version)" + ./dist/yt-dlp_macos_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -246,6 +279,16 @@ jobs: python3 pyinst.py mv dist/yt-dlp_macos dist/yt-dlp_macos_legacy + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos_legacy + cp ./dist/yt-dlp_macos_legacy ./dist/yt-dlp_macos_legacy_downgraded + version="$(./dist/yt-dlp_macos_legacy --version)" + ./dist/yt-dlp_macos_legacy_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_legacy_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -278,6 +321,19 @@ jobs: python pyinst.py --onedir Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp","yt-dlp_min")) { + Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } + - name: Upload artifacts uses: actions/upload-artifact@v3 with: @@ -309,6 +365,19 @@ jobs: run: | python pyinst.py + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp_x86")) { + 
Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } + - name: Upload artifacts uses: actions/upload-artifact@v3 with: From 23c39a4beadee382060bb47fdaa21316ca707d38 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Mon, 3 Apr 2023 07:22:11 +0200 Subject: [PATCH 095/501] [devscripts] `make_changelog`: Various improvements - Make single items collapse into one line - Don't hide "Important changes" in `<details>` - Move upstream merge into priority - Properly support comma separated prefixes Authored by: Grub4K --- .github/workflows/publish.yml | 4 +- devscripts/make_changelog.py | 187 +++++++++++++++++++--------------- 2 files changed, 106 insertions(+), 85 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3ca5c69924..9ebf54e7fc 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -54,9 +54,7 @@ jobs: cat >> ./RELEASE_NOTES << EOF #### A description of the various files are in the [README](https://github.com/yt-dlp/yt-dlp#release-files) --- - <details><summary><h3>Changelog</h3></summary> - $(python ./devscripts/make_changelog.py -vv) - </details> + $(python ./devscripts/make_changelog.py -vv --collapsible) EOF printf '%s\n\n' '**This is an automated nightly pre-release build**' >> ./NIGHTLY_NOTES cat ./RELEASE_NOTES >> ./NIGHTLY_NOTES diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index b159bc1b9b..1b7e251ee9 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -26,7 +26,6 @@ class CommitGroup(enum.Enum): - UPSTREAM = None PRIORITY = 'Important' CORE = 'Core' EXTRACTOR = 'Extractor' @@ -34,6 +33,11 @@ class CommitGroup(enum.Enum): POSTPROCESSOR = 'Postprocessor' MISC = 'Misc.' 
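The `--collapsible` behaviour added to the changelog generator (see the `publish.yml` and `make_changelog.py` hunks above) amounts to folding everything after the always-shown groups into an HTML `<details>` block. A rough sketch of that rendering rule, with simplified group tuples rather than the real `CommitGroup` machinery:

def render(groups, collapsible=True, always_shown=('Important',)):
    out, folded = [], False
    for name, lines in groups:
        if collapsible and not folded and name not in always_shown:
            out.append('<details><summary><h3>Changelog</h3></summary>')
            folded = True
        out.append(f'#### {name} changes')
        out.extend(f'- {line}' for line in lines)
    if folded:
        out.append('</details>')
    return '\n'.join(out)

print(render([('Important', ['security fix']), ('Core', ['misc cleanup'])]))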
+ @classmethod + @property + def ignorable_prefixes(cls): + return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream') + @classmethod @lru_cache def commit_lookup(cls): @@ -41,7 +45,6 @@ def commit_lookup(cls): name: group for group, names in { cls.PRIORITY: {''}, - cls.UPSTREAM: {'upstream'}, cls.CORE: { 'aes', 'cache', @@ -54,6 +57,7 @@ def commit_lookup(cls): 'outtmpl', 'plugins', 'update', + 'upstream', 'utils', }, cls.MISC: { @@ -111,22 +115,36 @@ def key(self): return ((self.details or '').lower(), self.sub_details, self.message) +def unique(items): + return sorted({item.strip().lower(): item for item in items if item}.values()) + + class Changelog: MISC_RE = re.compile(r'(?:^|\b)(?:lint(?:ing)?|misc|format(?:ting)?|fixes)(?:\b|$)', re.IGNORECASE) + ALWAYS_SHOWN = (CommitGroup.PRIORITY,) - def __init__(self, groups, repo): + def __init__(self, groups, repo, collapsible=False): self._groups = groups self._repo = repo + self._collapsible = collapsible def __str__(self): return '\n'.join(self._format_groups(self._groups)).replace('\t', ' ') def _format_groups(self, groups): + first = True for item in CommitGroup: + if self._collapsible and item not in self.ALWAYS_SHOWN and first: + first = False + yield '\n<details><summary><h3>Changelog</h3></summary>\n' + group = groups[item] if group: yield self.format_module(item.value, group) + if self._collapsible: + yield '\n</details>' + def format_module(self, name, group): result = f'\n#### {name} changes\n' if name else '\n' return result + '\n'.join(self._format_group(group)) @@ -137,62 +155,52 @@ def _format_group(self, group): for _, items in detail_groups: items = list(items) details = items[0].details - if not details: - indent = '' - else: - yield f'- {details}' - indent = '\t' if details == 'cleanup': - items, cleanup_misc_items = self._filter_cleanup_misc_items(items) + items = self._prepare_cleanup_misc_items(items) + + prefix = '-' + if details: + if len(items) == 1: + prefix = f'- **{details}**:' + else: + yield f'- **{details}**' + prefix = '\t-' sub_detail_groups = itertools.groupby(items, lambda item: tuple(map(str.lower, item.sub_details))) for sub_details, entries in sub_detail_groups: if not sub_details: for entry in entries: - yield f'{indent}- {self.format_single_change(entry)}' + yield f'{prefix} {self.format_single_change(entry)}' continue entries = list(entries) - prefix = f'{indent}- {", ".join(entries[0].sub_details)}' + sub_prefix = f'{prefix} {", ".join(entries[0].sub_details)}' if len(entries) == 1: - yield f'{prefix}: {self.format_single_change(entries[0])}' + yield f'{sub_prefix}: {self.format_single_change(entries[0])}' continue - yield prefix + yield sub_prefix for entry in entries: - yield f'{indent}\t- {self.format_single_change(entry)}' + yield f'\t{prefix} {self.format_single_change(entry)}' - if details == 'cleanup' and cleanup_misc_items: - yield from self._format_cleanup_misc_sub_group(cleanup_misc_items) - - def _filter_cleanup_misc_items(self, items): + def _prepare_cleanup_misc_items(self, items): cleanup_misc_items = defaultdict(list) - non_misc_items = [] + sorted_items = [] for item in items: if self.MISC_RE.search(item.message): cleanup_misc_items[tuple(item.commit.authors)].append(item) else: - non_misc_items.append(item) + sorted_items.append(item) - return non_misc_items, cleanup_misc_items + for commit_infos in cleanup_misc_items.values(): + sorted_items.append(CommitInfo( + 'cleanup', ('Miscellaneous',), ', '.join( + self._format_message_link(None, info.commit.hash) + for 
info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), + [], Commit(None, '', commit_infos[0].commit.authors), [])) - def _format_cleanup_misc_sub_group(self, group): - prefix = '\t- Miscellaneous' - if len(group) == 1: - yield f'{prefix}: {next(self._format_cleanup_misc_items(group))}' - return - - yield prefix - for message in self._format_cleanup_misc_items(group): - yield f'\t\t- {message}' - - def _format_cleanup_misc_items(self, group): - for authors, infos in group.items(): - message = ', '.join( - self._format_message_link(None, info.commit.hash) - for info in sorted(infos, key=lambda item: item.commit.hash or '')) - yield f'{message} by {self._format_authors(authors)}' + return sorted_items def format_single_change(self, info): message = self._format_message_link(info.message, info.commit.hash) @@ -236,12 +244,8 @@ class CommitRange: AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE) MESSAGE_RE = re.compile(r''' - (?:\[ - (?P<prefix>[^\]\/:,]+) - (?:/(?P<details>[^\]:,]+))? - (?:[:,](?P<sub_details>[^\]]+))? - \]\ )? - (?:(?P<sub_details_alt>`?[^:`]+`?): )? + (?:\[(?P<prefix>[^\]]+)\]\ )? + (?:(?P<sub_details>`?[^:`]+`?): )? (?P<message>.+?) (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))? ''', re.VERBOSE | re.DOTALL) @@ -340,60 +344,76 @@ def apply_overrides(self, overrides): self._commits = {key: value for key, value in reversed(self._commits.items())} def groups(self): - groups = defaultdict(list) + group_dict = defaultdict(list) for commit in self: - upstream_re = self.UPSTREAM_MERGE_RE.match(commit.short) + upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short) if upstream_re: - commit.short = f'[upstream] Merge up to youtube-dl {upstream_re.group(1)}' + commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}' match = self.MESSAGE_RE.fullmatch(commit.short) if not match: logger.error(f'Error parsing short commit message: {commit.short!r}') continue - prefix, details, sub_details, sub_details_alt, message, issues = match.groups() - group = None - if prefix: - if prefix == 'priority': - prefix, _, details = (details or '').partition('/') - logger.debug(f'Priority: {message!r}') - group = CommitGroup.PRIORITY - - if not details and prefix: - if prefix not in ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream'): - logger.debug(f'Replaced details with {prefix!r}') - details = prefix or None - - if details == 'common': - details = None - - if details: - details = details.strip() - - else: - group = CommitGroup.CORE - - sub_details = f'{sub_details or ""},{sub_details_alt or ""}'.replace(':', ',') - sub_details = tuple(filter(None, map(str.strip, sub_details.split(',')))) - + prefix, sub_details_alt, message, issues = match.groups() issues = [issue.strip()[1:] for issue in issues.split(',')] if issues else [] + if prefix: + groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(','))) + group = next(iter(filter(None, groups)), None) + details = ', '.join(unique(details)) + sub_details = list(itertools.chain.from_iterable(sub_details)) + else: + group = CommitGroup.CORE + details = None + sub_details = [] + + if sub_details_alt: + sub_details.append(sub_details_alt) + sub_details = tuple(unique(sub_details)) + if not group: - group = CommitGroup.get(prefix.lower()) - if not group: - if self.EXTRACTOR_INDICATOR_RE.search(commit.short): - group = CommitGroup.EXTRACTOR - else: - group = CommitGroup.POSTPROCESSOR - logger.warning(f'Failed to map {commit.short!r}, selected {group.name}') + if 
self.EXTRACTOR_INDICATOR_RE.search(commit.short): + group = CommitGroup.EXTRACTOR + else: + group = CommitGroup.POSTPROCESSOR + logger.warning(f'Failed to map {commit.short!r}, selected {group.name.lower()}') commit_info = CommitInfo( details, sub_details, message.strip(), issues, commit, self._fixes[commit.hash]) - logger.debug(f'Resolved {commit.short!r} to {commit_info!r}') - groups[group].append(commit_info) - return groups + logger.debug(f'Resolved {commit.short!r} to {commit_info!r}') + group_dict[group].append(commit_info) + + return group_dict + + @staticmethod + def details_from_prefix(prefix): + if not prefix: + return CommitGroup.CORE, None, () + + prefix, _, details = prefix.partition('/') + prefix = prefix.strip().lower() + details = details.strip() + + group = CommitGroup.get(prefix) + if group is CommitGroup.PRIORITY: + prefix, _, details = details.partition('/') + + if not details and prefix and prefix not in CommitGroup.ignorable_prefixes: + logger.debug(f'Replaced details with {prefix!r}') + details = prefix or None + + if details == 'common': + details = None + + if details: + details, *sub_details = details.split(':') + else: + sub_details = [] + + return group, details, sub_details def get_new_contributors(contributors_path, commits): @@ -444,6 +464,9 @@ def get_new_contributors(contributors_path, commits): parser.add_argument( '--repo', default='yt-dlp/yt-dlp', help='the github repository to use for the operations (default: %(default)s)') + parser.add_argument( + '--collapsible', action='store_true', + help='make changelog collapsible (default: %(default)s)') args = parser.parse_args() logging.basicConfig( @@ -467,4 +490,4 @@ def get_new_contributors(contributors_path, commits): write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') - print(Changelog(commits.groups(), args.repo)) + print(Changelog(commits.groups(), args.repo, args.collapsible)) From 69bec6730ec9d724bcedeab199d9d684d61423ba Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 21 May 2023 09:56:23 +1200 Subject: [PATCH 096/501] [cleanup, utils] Split into submodules (#7090) Closes https://github.com/yt-dlp/yt-dlp/pull/2173 Authored by: pukkandan, coletdjnz Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> --- Makefile | 2 +- setup.cfg | 1 + yt_dlp/YoutubeDL.py | 2 - yt_dlp/utils/__init__.py | 14 + yt_dlp/utils/_deprecated.py | 30 ++ yt_dlp/utils/_legacy.py | 163 ++++++++++ yt_dlp/{utils.py => utils/_utils.py} | 458 +-------------------------- yt_dlp/utils/traversal.py | 254 +++++++++++++++ 8 files changed, 480 insertions(+), 444 deletions(-) create mode 100644 yt_dlp/utils/__init__.py create mode 100644 yt_dlp/utils/_deprecated.py create mode 100644 yt_dlp/utils/_legacy.py rename yt_dlp/{utils.py => utils/_utils.py} (92%) create mode 100644 yt_dlp/utils/traversal.py diff --git a/Makefile b/Makefile index d5d47629b9..f03fe20523 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ offlinetest: codetest $(PYTHON) -m pytest -k "not download" # XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/dependencies +CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/utils yt_dlp/dependencies yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip for d in $(CODE_FOLDERS) ; do \ diff --git a/setup.cfg b/setup.cfg index 6deaa79715..68d9e516d1 100644 --- a/setup.cfg +++ b/setup.cfg 
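The `details_from_prefix` helper above replaces the old single-regex approach so that comma-separated prefixes can be parsed one by one. Stripped of group resolution and logging (both omitted here), the core rules look like this:

IGNORABLE = {'core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream'}

def details_from_prefix(prefix):
    prefix, _, details = prefix.partition('/')
    prefix, details = prefix.strip().lower(), details.strip()
    if not details and prefix and prefix not in IGNORABLE:
        details = prefix                      # e.g. "[cookies] ..." -> details "cookies"
    if details == 'common':
        details = None
    if details:
        details, *sub_details = details.split(':')
    else:
        sub_details = []
    return details, sub_details

assert details_from_prefix('extractor/youtube') == ('youtube', [])
assert details_from_prefix('cookies') == ('cookies', [])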
@@ -8,6 +8,7 @@ ignore = E402,E501,E731,E741,W503 max_line_length = 120 per_file_ignores = devscripts/lazy_load_template.py: F401 + yt_dlp/utils/__init__.py: F401, F403 [autoflake] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 91aec1fe6e..b8f1a05a09 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -124,7 +124,6 @@ parse_filesize, preferredencoding, prepend_extension, - register_socks_protocols, remove_terminal_sequences, render_table, replace_extension, @@ -739,7 +738,6 @@ def check_deprecated(param, option, suggestion): when=when) self._setup_opener() - register_socks_protocols() def preload_download_archive(fn): """Preload the archive, if any is specified""" diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py new file mode 100644 index 0000000000..74b39e2c7b --- /dev/null +++ b/yt_dlp/utils/__init__.py @@ -0,0 +1,14 @@ +import warnings + +from ..compat.compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +del passthrough_module + +# isort: off +from .traversal import * +from ._utils import * +from ._utils import _configuration_args, _get_exe_version_output +from ._deprecated import * diff --git a/yt_dlp/utils/_deprecated.py b/yt_dlp/utils/_deprecated.py new file mode 100644 index 0000000000..4454d84a72 --- /dev/null +++ b/yt_dlp/utils/_deprecated.py @@ -0,0 +1,30 @@ +"""Deprecated - New code should avoid these""" + +from ._utils import preferredencoding + + +def encodeFilename(s, for_subprocess=False): + assert isinstance(s, str) + return s + + +def decodeFilename(b, for_subprocess=False): + return b + + +def decodeArgument(b): + return b + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, str) + return optval + + +def error_to_compat_str(err): + return str(err) diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py new file mode 100644 index 0000000000..cd009b504c --- /dev/null +++ b/yt_dlp/utils/_legacy.py @@ -0,0 +1,163 @@ +"""No longer used and new code should not use. 
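`passthrough_module` in the new `utils/__init__.py` forwards unknown attributes to `_legacy` while emitting a `DeprecationWarning`. The underlying mechanism is PEP 562's module-level `__getattr__`; here is a self-contained imitation using synthetic modules, not yt-dlp's actual helper:

import sys
import types
import warnings

legacy = types.ModuleType('mypkg._legacy')          # stand-in for utils/_legacy.py
legacy.platform_name = lambda: 'demo-platform'

pkg = types.ModuleType('mypkg')                     # stand-in for utils/__init__.py
def _passthrough(name):
    warnings.warn(f'mypkg.{name} is deprecated', DeprecationWarning, stacklevel=2)
    return getattr(legacy, name)                    # AttributeError if truly missing
pkg.__getattr__ = _passthrough                      # PEP 562 module __getattr__
sys.modules['mypkg'] = pkg

import mypkg
print(mypkg.platform_name())                        # warns, then prints 'demo-platform'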
Exists only for API compat.""" + +import platform +import struct +import sys +import urllib.parse +import zlib + +from ._utils import decode_base_n, preferredencoding +from .traversal import traverse_obj +from ..dependencies import certifi, websockets + +has_certifi = bool(certifi) +has_websockets = bool(websockets) + + +def load_plugins(name, suffix, namespace): + from ..plugins import load_plugins + ret = load_plugins(name, suffix) + namespace.update(ret) + return ret + + +def traverse_dict(dictn, keys, casesense=True): + return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) + + +def decode_base(value, digits): + return decode_base_n(value, table=digits) + + +def platform_name(): + """ Returns the platform name as a str """ + return platform.platform() + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + +# UNUSED +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise OSError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise OSError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels + + +def register_socks_protocols(): + # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in 
urlparse.uses_netloc are not handled correctly + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in urllib.parse.uses_netloc: + urllib.parse.uses_netloc.append(scheme) diff --git a/yt_dlp/utils.py b/yt_dlp/utils/_utils.py similarity index 92% rename from yt_dlp/utils.py rename to yt_dlp/utils/_utils.py index 190af1b7d7..f032af9014 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils/_utils.py @@ -47,26 +47,18 @@ import xml.etree.ElementTree import zlib -from .compat import functools # isort: split -from .compat import ( +from . import traversal + +from ..compat import functools # isort: split +from ..compat import ( compat_etree_fromstring, compat_expanduser, compat_HTMLParseError, compat_os_name, compat_shlex_quote, ) -from .dependencies import brotli, certifi, websockets, xattr -from .socks import ProxyType, sockssocket - - -def register_socks_protocols(): - # "Register" SOCKS protocols - # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 - # URLs with protocols not in urlparse.uses_netloc are not handled correctly - for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in urllib.parse.uses_netloc: - urllib.parse.uses_netloc.append(scheme) - +from ..dependencies import brotli, certifi, websockets, xattr +from ..socks import ProxyType, sockssocket # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -928,27 +920,6 @@ def run(cls, *args, timeout=None, **kwargs): return stdout or default, stderr or default, proc.returncode -def get_subprocess_encoding(): - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return encoding - - -def encodeFilename(s, for_subprocess=False): - assert isinstance(s, str) - return s - - -def decodeFilename(b, for_subprocess=False): - return b - - def encodeArgument(s): # Legacy code that uses byte strings # Uncomment the following line after fixing all post processors @@ -956,20 +927,6 @@ def encodeArgument(s): return s if isinstance(s, str) else s.decode('ascii') -def decodeArgument(b): - return b - - -def decodeOption(optval): - if optval is None: - return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, str) - return optval - - _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds')) @@ -1034,7 +991,7 @@ def make_HTTPS_handler(params, **kwargs): context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE if opts_check_certificate: - if has_certifi and 'no-certifi' not in params.get('compat_opts', []): + if certifi and 'no-certifi' not in params.get('compat_opts', []): context.load_verify_locations(cafile=certifi.where()) else: try: @@ -1068,7 +1025,7 @@ def make_HTTPS_handler(params, **kwargs): def bug_reports_message(before=';'): - from .update import REPOSITORY + from ..update import REPOSITORY msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , ' 'filling out the appropriate issue template. 
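`register_socks_protocols` (kept in `_legacy` above) exists because, per its own comment, very old Pythons only handled the netloc for schemes registered in `urllib.parse.uses_netloc`. The registration itself is trivial and harmless on modern interpreters:

import urllib.parse

for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
    if scheme not in urllib.parse.uses_netloc:
        urllib.parse.uses_netloc.append(scheme)

# Modern urlsplit parses the authority either way:
print(urllib.parse.urlsplit('socks5://127.0.0.1:1080').port)  # 1080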
Confirm you are on the latest version using yt-dlp -U') @@ -2019,12 +1976,6 @@ def __eq__(self, other): and self.start == other.start and self.end == other.end) -def platform_name(): - """ Returns the platform name as a str """ - deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead') - return platform.platform() - - @functools.cache def system_identifier(): python_implementation = platform.python_implementation() @@ -2076,7 +2027,7 @@ def write_string(s, out=None, encoding=None): def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): - from . import _IN_CLI + from .. import _IN_CLI if _IN_CLI: if msg in deprecation_warning._cache: return @@ -3284,13 +3235,6 @@ def variadic(x, allowed_types=NO_DEFAULT): return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) -def dict_get(d, key_or_keys, default=None, skip_false_values=True): - for val in map(d.get, variadic(key_or_keys)): - if val is not None and (val or not skip_false_values): - return val - return default - - def try_call(*funcs, expected_type=None, args=[], kwargs={}): for f in funcs: try: @@ -3528,7 +3472,7 @@ def is_outdated_version(version, limit, assume_new=True): def ytdl_is_updateable(): """ Returns if yt-dlp can be updated with -U """ - from .update import is_non_updateable + from ..update import is_non_updateable return not is_non_updateable() @@ -3538,10 +3482,6 @@ def args_to_str(args): return ' '.join(compat_shlex_quote(a) for a in args) -def error_to_compat_str(err): - return str(err) - - def error_to_str(err): return f'{type(err).__name__}: {err}' @@ -3628,7 +3568,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): mimetype = mt.partition(';')[0].strip().lower() _, _, subtype = mimetype.rpartition('/') - ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) if ext: return ext elif default is not NO_DEFAULT: @@ -3660,7 +3600,7 @@ def parse_codecs(codecs_str): vcodec = full_codec if parts[0] in ('dvh1', 'dvhe'): hdr = 'DV' - elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10': + elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10': hdr = 'HDR10' elif parts[:2] == ['vp9', '2']: hdr = 'HDR10' @@ -3706,8 +3646,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): }, } - sanitize_codec = functools.partial( - try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower()) + sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) for ext in preferences or COMPATIBLE_CODECS.keys(): @@ -5088,12 +5027,6 @@ def decode_base_n(string, n=None, table=None): return result -def decode_base(value, digits): - deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed ' - f'in a future version. 
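The `mimetype2ext` change above uses `traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])` as a fallback chain over lookup keys. With plain dict access, the equivalent is the following (the three-entry MAP is a tiny illustrative excerpt, not the real table):

MAP = {'x-matroska': 'mkv', 'svg+xml': 'svg', 'xml': 'xml'}

def ext_for(mt):
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')
    for key in (mimetype, subtype, subtype.rsplit('+')[-1]):
        if key in MAP:
            return MAP[key]

assert ext_for('video/x-matroska; charset=utf-8') == 'mkv'
assert ext_for('image/svg+xml') == 'svg'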
Use {__name__}.decode_base_n instead') - return decode_base_n(value, table=digits) - - def decode_packed_codes(code): mobj = re.search(PACKED_CODES_RE, code) obfuscated_code, base, count, symbols = mobj.groups() @@ -5138,113 +5071,6 @@ def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n -# Based on png2str() written by @gdkchan and improved by @yokrysty -# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 -def decode_png(png_data): - # Reference: https://www.w3.org/TR/PNG/ - header = png_data[8:] - - if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise OSError('Not a valid PNG file.') - - int_map = {1: '>B', 2: '>H', 4: '>I'} - unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] - - chunks = [] - - while header: - length = unpack_integer(header[:4]) - header = header[4:] - - chunk_type = header[:4] - header = header[4:] - - chunk_data = header[:length] - header = header[length:] - - header = header[4:] # Skip CRC - - chunks.append({ - 'type': chunk_type, - 'length': length, - 'data': chunk_data - }) - - ihdr = chunks[0]['data'] - - width = unpack_integer(ihdr[:4]) - height = unpack_integer(ihdr[4:8]) - - idat = b'' - - for chunk in chunks: - if chunk['type'] == b'IDAT': - idat += chunk['data'] - - if not idat: - raise OSError('Unable to read PNG data.') - - decompressed_data = bytearray(zlib.decompress(idat)) - - stride = width * 3 - pixels = [] - - def _get_pixel(idx): - x = idx % stride - y = idx // stride - return pixels[y][x] - - for y in range(height): - basePos = y * (1 + stride) - filter_type = decompressed_data[basePos] - - current_row = [] - - pixels.append(current_row) - - for x in range(stride): - color = decompressed_data[1 + basePos + x] - basex = y * stride + x - left = 0 - up = 0 - - if x > 2: - left = _get_pixel(basex - 3) - if y > 0: - up = _get_pixel(basex - stride) - - if filter_type == 1: # Sub - color = (color + left) & 0xff - elif filter_type == 2: # Up - color = (color + up) & 0xff - elif filter_type == 3: # Average - color = (color + ((left + up) >> 1)) & 0xff - elif filter_type == 4: # Paeth - a = left - b = up - c = 0 - - if x > 2 and y > 0: - c = _get_pixel(basex - stride - 3) - - p = a + b - c - - pa = abs(p - a) - pb = abs(p - b) - pc = abs(p - c) - - if pa <= pb and pa <= pc: - color = (color + a) & 0xff - elif pb <= pc: - color = (color + b) & 0xff - else: - color = (color + c) & 0xff - - current_row.append(color) - - return width, height, pixels - - def write_xattr(path, key, value): # Windows: Write xattrs to NTFS Alternate Data Streams: # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 @@ -5403,7 +5229,7 @@ def to_high_limit_path(path): def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): - val = traverse_obj(obj, *variadic(field)) + val = traversal.traverse_obj(obj, *variadic(field)) if not val if ignore is NO_DEFAULT else val in variadic(ignore): return default return template % func(val) @@ -5441,12 +5267,12 @@ def make_dir(path, to_screen=None): return True except OSError as err: if callable(to_screen) is not None: - to_screen('unable to create directory ' + error_to_compat_str(err)) + to_screen(f'unable to create directory {err}') return False def get_executable_path(): - from .update import _get_variant_and_executable_path + from ..update import _get_variant_and_executable_path return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) @@ -5470,244 +5296,6 @@ def 
get_system_config_dirs(package_name): yield os.path.join('/etc', package_name) -def traverse_obj( - obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, - casesense=True, is_user_input=False, traverse_string=False): - """ - Safely traverse nested `dict`s and `Iterable`s - - >>> obj = [{}, {"key": "value"}] - >>> traverse_obj(obj, (1, "key")) - "value" - - Each of the provided `paths` is tested and the first producing a valid result will be returned. - The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. - Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. - - The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. - - The keys in the path can be one of: - - `None`: Return the current object. - - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{func}`. If a `type`, returns only values - of this type. If a function, returns `func(obj)`. - - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - - `slice`: Branch out and return all values in `obj[key]`. - - `Ellipsis`: Branch out and return a list of all values. - - `tuple`/`list`: Branch out and return a list of all matching values. - Read as: `[traverse_obj(obj, branch) for branch in branches]`. - - `function`: Branch out and return values filtered by the function. - Read as: `[value for key, value in obj if function(key, value)]`. - For `Iterable`s, `key` is the index of the value. - For `re.Match`es, `key` is the group number (0 = full match) - as well as additionally any group names, if given. - - `dict` Transform the current object and return a matching dict. - Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - - `tuple`, `list`, and `dict` all support nested paths and branches. - - @params paths Paths which to traverse by. - @param default Value to return if the paths do not match. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, depth first. Try to avoid if using nested `dict` keys. - @param expected_type If a `type`, only accept final values of this type. - If any other callable, try to call the function on each result. - If the last key in the path is a `dict`, it will apply to each value inside - the dict instead, recursively. This does respect branching paths. - @param get_all If `False`, return the first matching result, otherwise all matching ones. - @param casesense If `False`, consider string dictionary keys as case insensitive. - - The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API - - @param is_user_input Whether the keys are generated from user input. - If `True` strings get converted to `int`/`slice` if needed. - @param traverse_string Whether to traverse into objects as strings. - If `True`, any non-compatible object will first be - converted into a string and then traversed into. - The return value of that path will be a string instead, - not respecting any further branching. - - - @returns The result of the object traversal. - If successful, `get_all=True`, and the path branches at least once, - then a list of results is returned instead. - If no `default` is given and the last path branches, a `list` of results - is always returned. If a path ends on a `dict` that result will always be a `dict`. 
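A few concrete calls matching the `traverse_obj` docstring above; these run as written if a yt-dlp at or after this revision is installed (the function is still re-exported from `yt_dlp.utils`):

from yt_dlp.utils import traverse_obj

obj = [{'id': 1}, {'id': 2, 'meta': {'title': 'x'}}]
assert traverse_obj(obj, (1, 'meta', 'title')) == 'x'            # plain nested lookup
assert traverse_obj(obj, (..., 'id')) == [1, 2]                  # Ellipsis branches out
assert traverse_obj(obj, (0, 'missing'), default='n/a') == 'n/a'
assert traverse_obj(obj, (1, 'id', {str})) is None               # type filter: 2 is not a str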
- """ - casefold = lambda k: k.casefold() if isinstance(k, str) else k - - if isinstance(expected_type, type): - type_test = lambda val: val if isinstance(val, expected_type) else None - else: - type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) - - def apply_key(key, obj, is_last): - branching = False - result = None - - if obj is None and traverse_string: - if key is ... or callable(key) or isinstance(key, slice): - branching = True - result = () - - elif key is None: - result = obj - - elif isinstance(key, set): - assert len(key) == 1, 'Set should only be used to wrap a single item' - item = next(iter(key)) - if isinstance(item, type): - if isinstance(obj, item): - result = obj - else: - result = try_call(item, args=(obj,)) - - elif isinstance(key, (list, tuple)): - branching = True - result = itertools.chain.from_iterable( - apply_path(obj, branch, is_last)[0] for branch in key) - - elif key is ...: - branching = True - if isinstance(obj, collections.abc.Mapping): - result = obj.values() - elif is_iterable_like(obj): - result = obj - elif isinstance(obj, re.Match): - result = obj.groups() - elif traverse_string: - branching = False - result = str(obj) - else: - result = () - - elif callable(key): - branching = True - if isinstance(obj, collections.abc.Mapping): - iter_obj = obj.items() - elif is_iterable_like(obj): - iter_obj = enumerate(obj) - elif isinstance(obj, re.Match): - iter_obj = itertools.chain( - enumerate((obj.group(), *obj.groups())), - obj.groupdict().items()) - elif traverse_string: - branching = False - iter_obj = enumerate(str(obj)) - else: - iter_obj = () - - result = (v for k, v in iter_obj if try_call(key, args=(k, v))) - if not branching: # string traversal - result = ''.join(result) - - elif isinstance(key, dict): - iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) - result = { - k: v if v is not None else default for k, v in iter_obj - if v is not None or default is not NO_DEFAULT - } or None - - elif isinstance(obj, collections.abc.Mapping): - result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else - next((v for k, v in obj.items() if casefold(k) == key), None)) - - elif isinstance(obj, re.Match): - if isinstance(key, int) or casesense: - with contextlib.suppress(IndexError): - result = obj.group(key) - - elif isinstance(key, str): - result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) - - elif isinstance(key, (int, slice)): - if is_iterable_like(obj, collections.abc.Sequence): - branching = isinstance(key, slice) - with contextlib.suppress(IndexError): - result = obj[key] - elif traverse_string: - with contextlib.suppress(IndexError): - result = str(obj)[key] - - return branching, result if branching else (result,) - - def lazy_last(iterable): - iterator = iter(iterable) - prev = next(iterator, NO_DEFAULT) - if prev is NO_DEFAULT: - return - - for item in iterator: - yield False, prev - prev = item - - yield True, prev - - def apply_path(start_obj, path, test_type): - objs = (start_obj,) - has_branched = False - - key = None - for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): - if is_user_input and isinstance(key, str): - if key == ':': - key = ... 
- elif ':' in key: - key = slice(*map(int_or_none, key.split(':'))) - elif int_or_none(key) is not None: - key = int(key) - - if not casesense and isinstance(key, str): - key = key.casefold() - - if __debug__ and callable(key): - # Verify function signature - inspect.signature(key).bind(None, None) - - new_objs = [] - for obj in objs: - branching, results = apply_key(key, obj, last) - has_branched |= branching - new_objs.append(results) - - objs = itertools.chain.from_iterable(new_objs) - - if test_type and not isinstance(key, (dict, list, tuple)): - objs = map(type_test, objs) - - return objs, has_branched, isinstance(key, dict) - - def _traverse_obj(obj, path, allow_empty, test_type): - results, has_branched, is_dict = apply_path(obj, path, test_type) - results = LazyList(item for item in results if item not in (None, {})) - if get_all and has_branched: - if results: - return results.exhaust() - if allow_empty: - return [] if default is NO_DEFAULT else default - return None - - return results[0] if results else {} if allow_empty and is_dict else None - - for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result - - return None if default is NO_DEFAULT else default - - -def traverse_dict(dictn, keys, casesense=True): - deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed ' - f'in a future version. Use "{__name__}.traverse_obj" instead') - return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) - - -def get_first(obj, *paths, **kwargs): - return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) - - def time_seconds(**kwargs): """ Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z) @@ -5803,7 +5391,7 @@ def number_of_digits(number): def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: - values = (traverse_obj(from_dict, variadic(v)) for v in values) + values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(str, filter(None, values))) @@ -6514,15 +6102,3 @@ def calculate_preference(self, format): format['abr'] = format.get('tbr') - format.get('vbr', 0) return tuple(self._calculate_field_preference(format, field) for field in self._order) - - -# Deprecated -has_certifi = bool(certifi) -has_websockets = bool(websockets) - - -def load_plugins(name, suffix, namespace): - from .plugins import load_plugins - ret = load_plugins(name, suffix) - namespace.update(ret) - return ret diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py new file mode 100644 index 0000000000..462c3ba5df --- /dev/null +++ b/yt_dlp/utils/traversal.py @@ -0,0 +1,254 @@ +import collections.abc +import contextlib +import inspect +import itertools +import re + +from ._utils import ( + IDENTITY, + NO_DEFAULT, + LazyList, + int_or_none, + is_iterable_like, + try_call, + variadic, +) + + +def traverse_obj( + obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, + casesense=True, is_user_input=False, traverse_string=False): + """ + Safely traverse nested `dict`s and `Iterable`s + + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" + + Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. 
+ Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. + + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. + + The keys in the path can be one of: + - `None`: Return the current object. + - `set`: Requires the only item in the set to be a type or function, + like `{type}`/`{func}`. If a `type`, returns only values + of this type. If a function, returns `func(obj)`. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Iterable`s, `key` is the index of the value. + For `re.Match`es, `key` is the group number (0 = full match) + as well as additionally any group names, if given. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + @param default Value to return if the paths do not match. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, depth first. Try to avoid if using nested `dict` keys. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + If the last key in the path is a `dict`, it will apply to each value inside + the dict instead, recursively. This does respect branching paths. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + The return value of that path will be a string instead, + not respecting any further branching. + + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + If no `default` is given and the last path branches, a `list` of results + is always returned. If a path ends on a `dict` that result will always be a `dict`. + """ + casefold = lambda k: k.casefold() if isinstance(k, str) else k + + if isinstance(expected_type, type): + type_test = lambda val: val if isinstance(val, expected_type) else None + else: + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def apply_key(key, obj, is_last): + branching = False + result = None + + if obj is None and traverse_string: + if key is ... 
or callable(key) or isinstance(key, slice): + branching = True + result = () + + elif key is None: + result = obj + + elif isinstance(key, set): + assert len(key) == 1, 'Set should only be used to wrap a single item' + item = next(iter(key)) + if isinstance(item, type): + if isinstance(obj, item): + result = obj + else: + result = try_call(item, args=(obj,)) + + elif isinstance(key, (list, tuple)): + branching = True + result = itertools.chain.from_iterable( + apply_path(obj, branch, is_last)[0] for branch in key) + + elif key is ...: + branching = True + if isinstance(obj, collections.abc.Mapping): + result = obj.values() + elif is_iterable_like(obj): + result = obj + elif isinstance(obj, re.Match): + result = obj.groups() + elif traverse_string: + branching = False + result = str(obj) + else: + result = () + + elif callable(key): + branching = True + if isinstance(obj, collections.abc.Mapping): + iter_obj = obj.items() + elif is_iterable_like(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, re.Match): + iter_obj = itertools.chain( + enumerate((obj.group(), *obj.groups())), + obj.groupdict().items()) + elif traverse_string: + branching = False + iter_obj = enumerate(str(obj)) + else: + iter_obj = () + + result = (v for k, v in iter_obj if try_call(key, args=(k, v))) + if not branching: # string traversal + result = ''.join(result) + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items()) + result = { + k: v if v is not None else default for k, v in iter_obj + if v is not None or default is not NO_DEFAULT + } or None + + elif isinstance(obj, collections.abc.Mapping): + result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else + next((v for k, v in obj.items() if casefold(k) == key), None)) + + elif isinstance(obj, re.Match): + if isinstance(key, int) or casesense: + with contextlib.suppress(IndexError): + result = obj.group(key) + + elif isinstance(key, str): + result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + + elif isinstance(key, (int, slice)): + if is_iterable_like(obj, collections.abc.Sequence): + branching = isinstance(key, slice) + with contextlib.suppress(IndexError): + result = obj[key] + elif traverse_string: + with contextlib.suppress(IndexError): + result = str(obj)[key] + + return branching, result if branching else (result,) + + def lazy_last(iterable): + iterator = iter(iterable) + prev = next(iterator, NO_DEFAULT) + if prev is NO_DEFAULT: + return + + for item in iterator: + yield False, prev + prev = item + + yield True, prev + + def apply_path(start_obj, path, test_type): + objs = (start_obj,) + has_branched = False + + key = None + for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): + if is_user_input and isinstance(key, str): + if key == ':': + key = ... 
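+                # user-input shorthands: ':' (handled above) selects everything,
+                # 'start:stop[:step]' becomes a slice and bare digits an index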
+ elif ':' in key: + key = slice(*map(int_or_none, key.split(':'))) + elif int_or_none(key) is not None: + key = int(key) + + if not casesense and isinstance(key, str): + key = key.casefold() + + if __debug__ and callable(key): + # Verify function signature + inspect.signature(key).bind(None, None) + + new_objs = [] + for obj in objs: + branching, results = apply_key(key, obj, last) + has_branched |= branching + new_objs.append(results) + + objs = itertools.chain.from_iterable(new_objs) + + if test_type and not isinstance(key, (dict, list, tuple)): + objs = map(type_test, objs) + + return objs, has_branched, isinstance(key, dict) + + def _traverse_obj(obj, path, allow_empty, test_type): + results, has_branched, is_dict = apply_path(obj, path, test_type) + results = LazyList(item for item in results if item not in (None, {})) + if get_all and has_branched: + if results: + return results.exhaust() + if allow_empty: + return [] if default is NO_DEFAULT else default + return None + + return results[0] if results else {} if allow_empty and is_dict else None + + for index, path in enumerate(paths, 1): + result = _traverse_obj(obj, path, index == len(paths), True) + if result is not None: + return result + + return None if default is NO_DEFAULT else default + + +def get_first(obj, *paths, **kwargs): + return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) + + +def dict_get(d, key_or_keys, default=None, skip_false_values=True): + for val in map(d.get, variadic(key_or_keys)): + if val is not None and (val or not skip_false_values): + return val + return default From 955c89584b66fcd0fcfab3e611f1edeb1ca63886 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 21 May 2023 10:55:09 +1200 Subject: [PATCH 097/501] [core] Deprecate internal `Youtubedl-no-compression` header (#6876) Authored by: coletdjnz --- yt_dlp/YoutubeDL.py | 4 +++- yt_dlp/downloader/external.py | 4 +--- yt_dlp/downloader/http.py | 4 ++-- yt_dlp/extractor/litv.py | 2 +- yt_dlp/utils/_legacy.py | 10 ++++++++++ yt_dlp/utils/_utils.py | 23 ++++++----------------- 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index b8f1a05a09..1162d2df1a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2380,7 +2380,9 @@ def restore_last_token(self): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - + if 'Youtubedl-No-Compression' in res: # deprecated + res.pop('Youtubedl-No-Compression', None) + res['Accept-Encoding'] = 'identity' cookies = self._calc_cookies(info_dict['url']) if cookies: res['Cookie'] = cookies diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ee130c8270..007689a8c9 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -23,7 +23,6 @@ encodeArgument, encodeFilename, find_available_port, - handle_youtubedl_headers, remove_end, sanitized_Request, traverse_obj, @@ -529,10 +528,9 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): - headers_dict = handle_youtubedl_headers(fmt['http_headers']) # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
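+                # fmt['http_headers'] can now be used as-is; the legacy
+                # Youtubedl-no-compression munging happens in YoutubeDL._calc_headers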
- args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in headers_dict.items())]) + args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) if start_time: args += ['-ss', str(start_time)] diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index fa72d5722a..79f69b5d02 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -45,8 +45,8 @@ class DownloadContext(dict): ctx.tmpfilename = self.temp_name(filename) ctx.stream = None - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} + # Disable compression + headers = {'Accept-Encoding': 'identity'} add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 31826ac99e..0b792fb96f 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -113,7 +113,7 @@ def _real_extract(self, url): entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' title = program_info['title'] + program_info.get('secondaryMark', '') description = program_info.get('description') diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index cd009b504c..b0578a1d6b 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -161,3 +161,13 @@ def register_socks_protocols(): for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): if scheme not in urllib.parse.uses_netloc: urllib.parse.uses_netloc.append(scheme) + + +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index f032af9014..9f1a127cdb 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1308,25 +1308,12 @@ def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_a return hc -def handle_youtubedl_headers(headers): - filtered_headers = headers - - if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} - del filtered_headers['Youtubedl-no-compression'] - - return filtered_headers - - class YoutubeDLHandler(urllib.request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-no-compression", which will be - removed before making the real request. + the standard headers to every HTTP request and handles gzipped, deflated and + brotli responses from web servers. 
Part of this code was copied from: @@ -1389,11 +1376,13 @@ def http_request(self, req): if h.capitalize() not in req.headers: req.add_header(h, v) + if 'Youtubedl-no-compression' in req.headers: # deprecated + req.headers.pop('Youtubedl-no-compression', None) + req.add_header('Accept-encoding', 'identity') + if 'Accept-encoding' not in req.headers: req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS)) - req.headers = handle_youtubedl_headers(req.headers) - return super().do_request_(req) def http_response(self, req, resp): From 69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2 Mon Sep 17 00:00:00 2001 From: kangalio <jannik.a.schaper@web.de> Date: Mon, 22 May 2023 13:47:06 +0200 Subject: [PATCH 098/501] [extractor/youtube:music:search_url] Extract title (#7102) Authored by: kangalio Closes #7095 --- yt_dlp/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d089822f64..bd38900f2c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4579,8 +4579,11 @@ def _grid_entries(self, grid_renderer): def _music_reponsive_list_entry(self, renderer): video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) if video_id: + title = traverse_obj(renderer, ( + 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer', + 'text', 'runs', 0, 'text')) return self.url_result(f'https://music.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id) + ie=YoutubeIE.ie_key(), video_id=video_id, title=title) playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) if playlist_id: video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) From 46f1370e9af6f8af8762f67e27e5acb8f0c48a47 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 May 2023 23:29:30 +0530 Subject: [PATCH 099/501] [devscripts/cli_to_api] Add script --- devscripts/cli_to_api.py | 48 +++++++++++++++++++++++++++++++++++ yt_dlp/YoutubeDL.py | 8 +++--- yt_dlp/downloader/common.py | 7 ++--- yt_dlp/downloader/fragment.py | 4 +-- yt_dlp/utils/_utils.py | 6 +++-- 5 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 devscripts/cli_to_api.py diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py new file mode 100644 index 0000000000..b8b7cbcf1d --- /dev/null +++ b/devscripts/cli_to_api.py @@ -0,0 +1,48 @@ +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import yt_dlp +import yt_dlp.options + +create_parser = yt_dlp.options.create_parser + + +def parse_patched_options(opts): + patched_parser = create_parser() + patched_parser.defaults.update({ + 'ignoreerrors': False, + 'retries': 0, + 'fragment_retries': 0, + 'extract_flat': False, + 'concat_playlist': 'never', + }) + yt_dlp.options.__dict__['create_parser'] = lambda: patched_parser + try: + return yt_dlp.parse_options(opts) + finally: + yt_dlp.options.__dict__['create_parser'] = create_parser + + +default_opts = parse_patched_options([]).ydl_opts + + +def cli_to_api(opts, cli_defaults=False): + opts = (yt_dlp.parse_options if cli_defaults else parse_patched_options)(opts).ydl_opts + + diff = {k: v for k, v in opts.items() if default_opts[k] != v} + if 'postprocessors' in diff: + diff['postprocessors'] = [pp for pp in diff['postprocessors'] + if pp not in default_opts['postprocessors']] + return diff + + +if __name__ == '__main__': + from pprint 
import pprint + + print('\nThe arguments passed translate to:\n') + pprint(cli_to_api(sys.argv[1:])) + print('\nCombining these with the CLI defaults gives:\n') + pprint(cli_to_api(sys.argv[1:], True)) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 1162d2df1a..cd82b27727 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -280,7 +280,7 @@ class YoutubeDL: subtitles. The language can be prefixed with a "-" to exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. + daterange: A utils.DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. False to disable filesystem cache. @@ -329,13 +329,13 @@ class YoutubeDL: 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Whether to resolve and process url_results further - * False: Always process (default) + * False: Always process. Default for API * True: Never process * 'in_playlist': Do not process inside playlist/multi_video * 'discard': Always process, but don't return the result from inside playlist/multi_video * 'discard_in_playlist': Same as "discard", but only for - playlists (not multi_video) + playlists (not multi_video). Default for CLI wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -472,7 +472,7 @@ class YoutubeDL: can also be used The following options are used by the extractors: - extractor_retries: Number of times to retry for known errors + extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) hls_split_discontinuity: Split HLS playlists to different formats at discontinuities such as ad breaks (default: False) diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 077b29b41f..8f9bc05d6e 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -51,8 +51,9 @@ class FileDownloader: ratelimit: Download speed limit, in bytes/sec. continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) - retries: Number of times to retry for HTTP error 5xx - file_access_retries: Number of times to retry on file access error + retries: Number of times to retry for expected network errors. + Default is 0 for API, but 10 for CLI + file_access_retries: Number of times to retry on file access error (default: 3) buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. 
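A minimal sketch of opting back into the CLI's retry behaviour from the API,
using only the option names documented in the docstring above:

    import yt_dlp

    ydl_opts = {
        'retries': 10,              # CLI default; the API default is 0
        'fragment_retries': 10,     # CLI default; the API default is 0
        'file_access_retries': 3,   # default for both API and CLI
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])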
@@ -225,7 +226,7 @@ def error_callback(err, count, retries, *, fd): sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) def wrapper(self, func, *args, **kwargs): - for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + for retry in RetryManager(self.params.get('file_access_retries', 3), error_callback, fd=self): try: return func(self, *args, **kwargs) except OSError as err: diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 3dc638f523..8abf7760ba 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -34,8 +34,8 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) + fragment_retries: Number of times to retry a fragment for HTTP error + (DASH and hlsnative only). Default is 0 for API, but 10 for CLI skip_unavailable_fragments: Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 9f1a127cdb..afcb2a1642 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -60,6 +60,8 @@ from ..dependencies import brotli, certifi, websockets, xattr from ..socks import ProxyType, sockssocket +__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -1957,8 +1959,8 @@ def __contains__(self, date): date = date_from_str(date) return self.start <= date <= self.end - def __str__(self): - return f'{self.start.isoformat()} - {self.end.isoformat()}' + def __repr__(self): + return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})' def __eq__(self, other): return (isinstance(other, DateRange) From 4823ec9f461512daa1b8ab362893bb86a6320b26 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 May 2023 23:30:43 +0530 Subject: [PATCH 100/501] Update to ytdl-commit-d1c6c5 [YouTube] [core] Improve platform debug log, based on yt-dlp https://github.com/ytdl-org/youtube-dl/commit/d1c6c5c4d618fa950813c0c71aede34a5ac851e9 Except: * 6ed34338285f722d0da312ce0af3a15a077a3e2a [jsinterp] Add short-cut evaluation for common expression * There was no performance improvement when tested with https://github.com/ytdl-org/youtube-dl/issues/30641 * e8de54bce50f6f77a4d7e8e80675f7003d5bf630 [core] Handle `/../` sequences in HTTP URLs * We plan to implement this differently --- test/test_jsinterp.py | 32 ++++++++++++++++++++++++++------ test/test_utils.py | 32 ++++++++++++++++++++++++++++++++ yt_dlp/downloader/common.py | 24 +++++++++++++++++------- yt_dlp/downloader/fragment.py | 33 ++++++++++++++++++++------------- yt_dlp/downloader/http.py | 3 ++- yt_dlp/extractor/aenetworks.py | 15 +++++++++++++-- yt_dlp/extractor/litv.py | 2 +- yt_dlp/extractor/youtube.py | 10 +++------- yt_dlp/jsinterp.py | 2 +- yt_dlp/utils/_utils.py | 12 ++++++++++-- 10 files changed, 125 insertions(+), 40 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 444909b84b..96274116b9 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -66,9 +66,8 @@ def test_assignments(self): self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) + @unittest.skip('Not implemented') def test_comments(self): - 'Skipping: Not yet fully 
implemented' - return self._test(''' function f() { var x = /* 1 + */ 2; @@ -100,10 +99,13 @@ def test_builtins(self): jsi = JSInterpreter('function f() { return NaN }') self.assertTrue(math.isnan(jsi.call_function('f'))) - self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', - 86000) - self._test('function f(dt) { return new Date(dt) - 0; }', - 86000, args=['Wednesday 31 December 1969 18:01:26 MDT']) + def test_date(self): + self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', 86000) + + jsi = JSInterpreter('function f(dt) { return new Date(dt) - 0; }') + self.assertEqual(jsi.call_function('f', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + self.assertEqual(jsi.call_function('f', '12/31/1969 18:01:26 MDT'), 86000) # m/d/y + self.assertEqual(jsi.call_function('f', '1 January 1970 00:00:00 UTC'), 0) def test_call(self): jsi = JSInterpreter(''' @@ -286,6 +288,19 @@ def test_regex(self): jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') + @unittest.skip('Not implemented') + def test_replace(self): + self._test('function f() { let a="data-name".replace("data-", ""); return a }', + 'name') + self._test('function f() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; }', + 'name') + self._test('function f() { let a="data-name".replace(/^.+-/, ""); return a; }', + 'name') + self._test('function f() { let a="data-name".replace(/a/g, "o"); return a; }', + 'doto-nome') + self._test('function f() { let a="data-name".replaceAll("a", "o"); return a; }', + 'doto-nome') + def test_char_code_at(self): jsi = JSInterpreter('function f(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('f', 0), 116) @@ -311,6 +326,11 @@ def test_negative(self): self._test('function f(){return 2 - + + - -2;}', 0) self._test('function f(){return 2 + - + - -2;}', 0) + @unittest.skip('Not implemented') + def test_packed(self): + jsi = JSInterpreter('''function f(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') + self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("<q />").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 
9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index e1bf6ac20f..a22f25d730 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,6 +5,7 @@ import re import sys import unittest +import warnings sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -112,6 +113,7 @@ subtitles_filename, timeconvert, traverse_obj, + try_call, unescapeHTML, unified_strdate, unified_timestamp, @@ -123,6 
+125,7 @@ urlencode_postdata, urljoin, urshift, + variadic, version_tuple, xpath_attr, xpath_element, @@ -1974,6 +1977,35 @@ def test_get_compatible_ext(self): self.assertEqual(get_compatible_ext( vcodecs=['av1'], acodecs=['mp4a'], vexts=['webm'], aexts=['m4a'], preferences=('webm', 'mkv')), 'mkv') + def test_try_call(self): + def total(*x, **kwargs): + return sum(x) + sum(kwargs.values()) + + self.assertEqual(try_call(None), None, + msg='not a fn should give None') + self.assertEqual(try_call(lambda: 1), 1, + msg='int fn with no expected_type should give int') + self.assertEqual(try_call(lambda: 1, expected_type=int), 1, + msg='int fn with expected_type int should give int') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with wrong expected_type should give None') + self.assertEqual(try_call(total, args=(0, 1, 0, ), expected_type=int), 1, + msg='fn should accept arglist') + self.assertEqual(try_call(total, kwargs={'a': 0, 'b': 1, 'c': 0}, expected_type=int), 1, + msg='fn should accept kwargs') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with no expected_type should give None') + self.assertEqual(try_call(lambda x: {}, total, args=(42, ), expected_type=int), 42, + msg='expect first int result with expected_type int') + + def test_variadic(self): + self.assertEqual(variadic(None), (None, )) + self.assertEqual(variadic('spam'), ('spam', )) + self.assertEqual(variadic('spam', allowed_types=dict), 'spam') + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') + def test_traverse_obj(self): _TEST_DATA = { 100: 100, diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 8f9bc05d6e..c48a2ff8ac 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -139,17 +139,21 @@ def calc_percent(byte_counter, data_len): def format_percent(percent): return ' N/A%' if percent is None else f'{percent:>5.1f}%' - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, total=NO_DEFAULT, current=NO_DEFAULT): + if total is NO_DEFAULT: + rate, remaining = start_or_rate, now_or_remaining + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + + start, now = start_or_rate, now_or_remaining if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def calc_speed(start, now, bytes): @@ -166,6 +170,12 @@ def format_speed(speed): def format_retries(retries): return 'inf' if retries == float('inf') else int(retries) + @staticmethod + def filesize_or_none(unencoded_filename): + if os.path.isfile(unencoded_filename): + return os.path.getsize(unencoded_filename) + return 0 + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 8abf7760ba..6770815abb 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -121,6 +121,11 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_dat 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } + frag_resume_len = 0 + if 
ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none(self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = ctx['frag_resume_len'] = frag_resume_len + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False @@ -155,9 +160,7 @@ def _append_fragment(self, ctx, frag_content): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -173,12 +176,11 @@ def _prepare_frag_download(self, ctx): }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -187,7 +189,9 @@ def _prepare_frag_download(self, ctx): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + continuedl = self.params.get('continuedl', True) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -201,7 +205,12 @@ def _prepare_frag_download(self, ctx): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -274,12 +283,10 @@ def frag_progress_hook(s): else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) + if not ctx['live']: + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -297,7 +304,7 @@ def _finish_frag_download(self, ctx, info_dict): to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) + downloaded_bytes = self.filesize_or_none(ctx['filename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 79f69b5d02..e785f0d4ed 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -150,7 +150,8 @@ def establish_connection(): # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + elif range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index d7c401016c..f049a0fb3c 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -3,6 +3,8 @@ ExtractorError, GeoRestrictedError, int_or_none, + remove_start, + traverse_obj, update_url_query, urlencode_postdata, ) @@ -72,7 +74,14 @@ def _extract_aetn_info(self, domain, filter_key, filter_value, url): requestor_id, brand = self._DOMAIN_MAP[domain] result = self._download_json( 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + filter_value, query={'filter[%s]' % filter_key: filter_value}) + result = traverse_obj( + result, ('results', + lambda k, v: k == 0 and v[filter_key] == filter_value), + get_all=False) + if not result: + raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, + video_id=remove_start(filter_value, '/')) title = result['title'] video_id = result['id'] media_url = result['publicUrl'] @@ -123,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -140,6 +149,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -303,6 +313,7 @@ def _real_extract(self, url): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 0b792fb96f..19b298ec6c 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -4,8 +4,8 @@ from ..utils import ( ExtractorError, int_or_none, - traverse_obj, smuggle_url, + traverse_obj, unsmuggle_url, ) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index bd38900f2c..654bf5e6b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -66,7 +66,6 @@ variadic, ) - STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { @@ -2994,17 +2993,14 @@ def _parse_sig_js(self, jscode): r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - 
r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -4883,7 +4879,7 @@ def _extract_metadata_from_tabs(self, item_id, data): metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) if metadata_renderer: channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}), - ('channelUrl', {self.ucid_from_url})) + ('channelUrl', {self.ucid_from_url})) info.update({ 'channel': metadata_renderer.get('title'), 'channel_id': channel_id, diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 82974fb27b..1ef1f0823a 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -443,7 +443,7 @@ def dict_item(key, val): err = e pending = (None, False) - m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + m = re.match(fr'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{', expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) if err: diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index afcb2a1642..238b0fe694 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -130,8 +130,13 @@ def random_user_agent(): } -NO_DEFAULT = object() -IDENTITY = lambda x: x +class NO_DEFAULT: + pass + + +def IDENTITY(x): + return x + ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', @@ -3223,6 +3228,9 @@ def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO def variadic(x, allowed_types=NO_DEFAULT): + if not isinstance(allowed_types, (tuple, type)): + deprecation_warning('allowed_types should be a tuple or a type') + allowed_types = tuple(allowed_types) return x if is_iterable_like(x, blocked_types=allowed_types) else (x, ) From 15b2d3db1d40b0437fca79d8874d392aa54b3cdd Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 May 2023 22:13:24 +0530 Subject: [PATCH 101/501] [misc] Add automatic duplicate issue detection --- .github/workflows/potential-duplicates.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/potential-duplicates.yml diff --git a/.github/workflows/potential-duplicates.yml b/.github/workflows/potential-duplicates.yml new file mode 100644 index 0000000000..1521ae20c0 --- /dev/null +++ b/.github/workflows/potential-duplicates.yml @@ -0,0 +1,20 @@ +name: Potential Duplicates +on: + issues: + types: [opened, edited] + +jobs: + run: + runs-on: ubuntu-latest + steps: + - uses: 
wow-actions/potential-duplicates@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + label: potential-duplicate + state: all + threshold: 0.7 + comment: | + This issue is potentially a duplicate of one of the following issues: + {{#issues}} + - #{{ number }} ({{ accuracy }}%) + {{/issues}} From 7aeda6cc9e73ada0b0a0b6a6748c66bef63a20a8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 May 2023 23:05:20 +0530 Subject: [PATCH 102/501] [jsinterp] Do not compile regex --- test/test_jsinterp.py | 4 +++- yt_dlp/jsinterp.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 96274116b9..4d44e6efe6 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -8,7 +8,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math -import re from yt_dlp.jsinterp import JS_Undefined, JSInterpreter @@ -275,7 +274,9 @@ def test_object(self): def test_regex(self): self._test('function f() { let a=/,,[/,913,/](,)}/; }', None) + self._test('function f() { let a=/,,[/,913,/](,)}/; return a; }', R'/,,[/,913,/](,)}/0') + R''' # We are not compiling regex jsi = JSInterpreter('function f() { let a=/,,[/,913,/](,)}/; return a; }') self.assertIsInstance(jsi.call_function('f'), re.Pattern) @@ -287,6 +288,7 @@ def test_regex(self): jsi = JSInterpreter(R'function f() { let a=[/[)\\]/]; return a[0]; }') self.assertEqual(jsi.call_function('f').pattern, r'[)\\]') + ''' @unittest.skip('Not implemented') def test_replace(self): diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 1ef1f0823a..7c7940efd5 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -352,8 +352,10 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': flags, outer = self._regex_flags(outer) + # We don't support regex methods yet, so no point compiling it + inner = f'{inner}/{flags}' # Avoid https://github.com/python/cpython/issues/74534 - inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) + # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) else: inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: From 8417f26b8a819cd7ffcd4e000ca3e45033e670fb Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Wed, 24 May 2023 20:35:07 +0200 Subject: [PATCH 103/501] [core] Implement `--color` flag (#6904) Authored by: Grub4K --- README.md | 9 +++++++-- yt_dlp/YoutubeDL.py | 36 +++++++++++++++++++++++++++++++----- yt_dlp/__init__.py | 6 +++++- yt_dlp/downloader/common.py | 3 ++- yt_dlp/options.py | 24 +++++++++++++++++++++--- 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d0eaba7477..25ed3b8441 100644 --- a/README.md +++ b/README.md @@ -425,8 +425,12 @@ ## General Options: --no-wait-for-video Do not wait for scheduled streams (default) --mark-watched Mark videos watched (even with --simulate) --no-mark-watched Do not mark videos watched (default) - --no-colors Do not emit color codes in output (Alias: - --no-colours) + --color [STREAM:]POLICY Whether to emit color codes in output, + optionally prefixed by the STREAM (stdout or + stderr) to apply the setting to. Can be one + of "always", "auto" (default), "never", or + "no_color" (use non color terminal + sequences). 
Can be used multiple times --compat-options OPTS Options that can help keep compatibility with youtube-dl or youtube-dlc configurations by reverting some of the @@ -2148,6 +2152,7 @@ #### Redundant options --playlist-end NUMBER -I :NUMBER --playlist-reverse -I ::-1 --no-playlist-reverse Default + --no-colors --color no_color #### Not recommended diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index cd82b27727..e1e5588363 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -415,7 +415,12 @@ class YoutubeDL: - Raise utils.DownloadCancelled(msg) to abort remaining downloads when a video is rejected. match_filter_func in utils.py is one example for this. - no_color: Do not emit color codes in output. + color: A Dictionary with output stream names as keys + and their respective color policy as values. + Can also just be a single color policy, + in which case it applies to all outputs. + Valid stream names are 'stdout' and 'stderr'. + Valid color policies are one of 'always', 'auto', 'no_color' or 'never'. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header geo_bypass_country: @@ -537,6 +542,7 @@ class YoutubeDL: data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. (only for youtube) + no_color: Same as `color='no_color'` """ _NUMERIC_FIELDS = { @@ -603,9 +609,24 @@ def __init__(self, params=None, auto_init=True): except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + if self.params.get('no_color'): + if self.params.get('color') is not None: + self.report_warning('Overwriting params from "color" with "no_color"') + self.params['color'] = 'no_color' + + term_allow_color = os.environ.get('TERM', '').lower() != 'dumb' + + def process_color_policy(stream): + stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream] + policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) + if policy in ('auto', None): + return term_allow_color and supports_terminal_sequences(stream) + assert policy in ('always', 'never', 'no_color') + return {'always': True, 'never': False}.get(policy, policy) + self._allow_colors = Namespace(**{ - type_: not self.params.get('no_color') and supports_terminal_sequences(stream) - for type_, stream in self._out_files.items_ if type_ != 'console' + name: process_color_policy(stream) + for name, stream in self._out_files.items_ if name != 'console' }) # The code is left like this to be reused for future deprecations @@ -974,7 +995,7 @@ def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_enc text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - return format_text(text, f) if allow_colors else text if fallback is None else fallback + return format_text(text, f) if allow_colors is True else text if fallback is None else fallback def _format_out(self, *args, **kwargs): return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) @@ -3769,9 +3790,14 @@ def print_debug_header(self): def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + additional_info = [] + if os.environ.get('TERM', '').lower() == 'dumb': + additional_info.append('dumb') if not supports_terminal_sequences(stream): from .utils import WINDOWS_VT_MODE # Must be imported locally - ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + additional_info.append('No VT' if 
WINDOWS_VT_MODE is False else 'No ANSI') + if additional_info: + ret = f'{ret} ({",".join(additional_info)})' return ret encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 9563d784aa..137c9503f6 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -436,6 +436,10 @@ def metadataparser_actions(f): elif ed and proto == 'default': default_downloader = ed.get_basename() + for policy in opts.color.values(): + if policy not in ('always', 'auto', 'no_color', 'never'): + raise ValueError(f'"{policy}" is not a valid color policy') + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -894,7 +898,7 @@ def parse_options(argv=None): 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': opts.match_filter, - 'no_color': opts.no_color, + 'color': opts.color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'hls_use_mpegts': opts.hls_use_mpegts, diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index c48a2ff8ac..477ec3c8a0 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -296,7 +296,8 @@ def _prepare_multiline_status(self, lines=1): self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) - self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') + self._multiline.allow_colors = self.ydl._allow_colors.out and self.ydl._allow_colors.out != 'no_color' + self._multiline._HAVE_FULLCAP = self.ydl._allow_colors.out def _finish_multiline_status(self): self._multiline.end() diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 838d79fcb1..fecc274031 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -34,6 +34,7 @@ join_nonempty, orderedSet_from_options, remove_end, + variadic, write_string, ) from .version import CHANNEL, __version__ @@ -250,7 +251,7 @@ def _dict_from_options_callback( if multiple_args: val = [val, *value[1:]] elif default_key is not None: - keys, val = [default_key], value + keys, val = variadic(default_key), value else: raise optparse.OptionValueError( f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"') @@ -440,8 +441,25 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not mark videos watched (default)') general.add_option( '--no-colors', '--no-colours', - action='store_true', dest='no_color', default=False, - help='Do not emit color codes in output (Alias: --no-colours)') + action='store_const', dest='color', const={ + 'stdout': 'no_color', + 'stderr': 'no_color', + }, + help=optparse.SUPPRESS_HELP) + general.add_option( + '--color', + dest='color', metavar='[STREAM:]POLICY', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'stdout|stderr', + 'default_key': ['stdout', 'stderr'], + 'process': str.strip, + }, help=( + 'Whether to emit color codes in output, optionally prefixed by ' + 'the STREAM (stdout or stderr) to apply the setting to. ' + 'Can be one of "always", "auto" (default), "never", or ' + '"no_color" (use non color terminal sequences). 
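An illustrative model (not yt-dlp's actual option machinery) of how `_dict_from_options_callback` with the `callback_kwargs` above folds repeated `[STREAM:]POLICY` values into the dict that `YoutubeDL` consumes; `allowed_keys` is simplified here from the `'stdout|stderr'` regex to a tuple, and policy validation is omitted:

    def parse_color_args(values, allowed_keys=('stdout', 'stderr')):
        out = {}
        for value in values:
            key, sep, policy = value.partition(':')
            if sep and key in allowed_keys:
                out[key] = policy.strip()
            else:  # default_key=['stdout', 'stderr'] fills both streams
                for k in allowed_keys:
                    out[k] = value.strip()
        return out

    assert parse_color_args(['always', 'stderr:never']) == {'stdout': 'always', 'stderr': 'never'}

In this model, later occurrences overwrite earlier ones, so `--color always --color stderr:never` leaves stdout on `always`.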
' + 'Can be used multiple times')) general.add_option( '--compat-options', metavar='OPTS', dest='compat_opts', default=set(), type='str', From 032de83ea9ff2f4977d9c71a93bbc1775597b762 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Wed, 24 May 2023 20:45:15 +0200 Subject: [PATCH 104/501] [extractor/crunchyroll] Rework with support for movies, music and artists (#6237) This adds `CrunchyrollMusicIE` and `CrunchyrollArtistIE` extractors using the new, reworked base class and expands the `CrunchyrollBetaIE` with support for movies and movie listings and more complete metadata extraction Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/crunchyroll.py | 692 +++++++++++++++++++++++--------- 2 files changed, 499 insertions(+), 195 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fd2bfa9a10..8984d4b167 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -406,6 +406,8 @@ from .crunchyroll import ( CrunchyrollBetaIE, CrunchyrollBetaShowIE, + CrunchyrollMusicIE, + CrunchyrollArtistIE, ) from .cspan import CSpanIE, CSpanCongressIE from .ctsnews import CtsNewsIE diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 1abffcd745..d4a21616ba 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,28 +1,37 @@ import base64 -import urllib.parse +import urllib.error from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, format_field, + int_or_none, join_nonempty, + parse_age_limit, + parse_count, parse_iso8601, qualities, + remove_start, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _AUTH_HEADERS = None + _API_ENDPOINT = None + _BASIC_AUTH = None + _QUERY = {} @property def is_logged_in(self): - return self._get_cookies(self._LOGIN_URL).get('etp_rt') + return self._get_cookies(self._BASE_URL).get('etp_rt') def _perform_login(self, username, password): if self.is_logged_in: @@ -35,7 +44,7 @@ def _perform_login(self, username, password): 'device_id': 'whatvalueshouldbeforweb', 'device_type': 'com.crunchyroll.static', 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL + 'referer': f'{self._BASE_URL}/welcome/login' }) if upsell_response['code'] != 'ok': raise ExtractorError('Could not get session id') @@ -43,149 +52,89 @@ def _perform_login(self, username, password): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ + data=urlencode_postdata({ 'account': username, 'password': password, 'session_id': session_id - }).encode('ascii')) + })) if login_response['code'] != 'ok': raise ExtractorError('Login failed. 
Server message: %s' % login_response['message'], expected=True) if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config + def _update_query(self, lang): + if lang in CrunchyrollBaseIE._QUERY: + return - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' - else: - grant_type, key = 'client_id', 'anonClientId' + webpage = self._download_webpage( + f'{self._BASE_URL}/{lang}', None, note=f'Retrieving main page (lang={lang or None})') - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') + initial_state = self._search_json(r'__INITIAL_STATE__\s*=', webpage, 'initial state', None) + CrunchyrollBaseIE._QUERY[lang] = traverse_obj(initial_state, { + 'locale': ('localization', 'locale'), + }) or None - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] - } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBaseIE.params + if CrunchyrollBaseIE._BASIC_AUTH: + return + app_config = self._search_json(r'__APP_CONFIG__\s*=', webpage, 'app config', None) + cx_api_param = app_config['cxApiParams']['accountAuthClientId' if self.is_logged_in else 'anonClientId'] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() -class CrunchyrollBetaIE(CrunchyrollBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ - (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
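The `_BASIC_AUTH` value built above is the standard RFC 7617 user-pass form with the client id as username and an empty password; a standalone restatement (`basic_auth` is an illustrative name, and the input is a placeholder, not a real client id):

    import base64

    def basic_auth(cx_api_param):
        return 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()

    assert basic_auth('abc') == 'Basic YWJjOg=='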
- watch/(?P<id>\w+) - (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', - 'info_dict': { - 'id': 'GY2P1Q98Y', - 'ext': 'mp4', - 'duration': 1380.241, - 'timestamp': 1459632600, - 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'title': 'World Trigger Episode 73 – To the Future', - 'upload_date': '20160402', - 'series': 'World Trigger', - 'series_id': 'GR757DMKY', - 'season': 'World Trigger', - 'season_id': 'GR9P39NJ6', - 'season_number': 1, - 'episode': 'To the Future', - 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:2', - }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, - }, { - 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', - 'info_dict': { - 'id': 'GYE5WKQGR', - 'ext': 'mp4', - 'duration': 366.459, - 'timestamp': 1476788400, - 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', - 'upload_date': '20161018', - 'series': 'SHELTER', - 'series_id': 'GYGG09WWY', - 'season': 'SHELTER', - 'season_id': 'GR09MGK4R', - 'season_number': 1, - 'episode': 'Porter Robinson presents Shelter the Animation', - 'episode_number': 0, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:0', - }, - 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', - }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', - 'only_matching': True, - }, { - 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', - 'only_matching': True, - }] + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): + return - def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + assert CrunchyrollBaseIE._BASIC_AUTH, '_update_query needs to be called at least one time beforehand' + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + auth_response = self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): - if self.is_logged_in: - raise ExtractorError('This video is for premium members only', expected=True) - else: - self.raise_login_required('This video is for premium members only') + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} + CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_query(lang) + self._update_auth() - requested_hardsubs = [('' if val == 'none' else val) for val in 
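The pattern in `_update_auth` above (cache the headers on the class, refresh 10 seconds early, default the lifetime to 300 seconds) reduces to roughly the following sketch; `request_token` is a placeholder for the `/auth/v1/token` POST:

    import time

    _AUTH = {'headers': None, 'refresh_at': 0}

    def auth_headers(request_token):
        if _AUTH['headers'] and _AUTH['refresh_at'] > time.time():
            return _AUTH['headers']  # still fresh, no network round-trip
        token = request_token()
        _AUTH['headers'] = {'Authorization': f"{token['token_type']} {token['access_token']}"}
        # renew 10 seconds before the reported expiry to avoid racing it
        _AUTH['refresh_at'] = time.time() + (token.get('expires_in') or 300) - 10
        return _AUTH['headers']

    assert auth_headers(lambda: {'token_type': 'Bearer', 'access_token': 'xyz'}) == {'Authorization': 'Bearer xyz'}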
(self._configuration_arg('hardsub') or ['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' + + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query={**CrunchyrollBaseIE._QUERY[lang], **query}) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, urllib.error.HTTPError) and error.cause.code == 404: + return None + raise + + if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_formats(self, stream_response, display_id=None): requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - available_formats = {} - for stream_type, streams in get_streams('streams'): + for stream_type, streams in traverse_obj( + stream_response, (('streams', ('data', 0)), {dict.items}, ...)): if stream_type not in requested_formats: continue - for stream in streams.values(): - if not stream.get('url'): - continue + for stream in traverse_obj(streams, lambda _, v: v['url']): hardsub_lang = stream.get('hardsub_locale') or '' format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] if '' in available_formats and 'all' not in requested_hardsubs: full_format_langs = set(requested_hardsubs) self.to_screen( @@ -196,6 +145,8 @@ def _real_extract(self, url): else: full_format_langs = set(map(str.lower, available_formats)) + audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + hardsub_preference = qualities(requested_hardsubs[::-1]) formats = [] for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): if stream_type.endswith('hls'): @@ -214,63 +165,292 @@ def _real_extract(self, url): continue for f in adaptive_formats: if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') + f['language'] = audio_locale f['quality'] = hardsub_preference(hardsub_lang.lower()) formats.extend(adaptive_formats) - chapters = None + return formats + + def _extract_subtitles(self, data): + subtitles = {} + + for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): + subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] + + return subtitles + + +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': response['key_pair_id'], + } + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = 
parse_iso8601(response['expires']) - 10 + + if not path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) + + +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta\.|www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + watch/(?!concert|musicvideo)(?P<id>\w+)''' + _TESTS = [{ + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': 'GY2P1Q98Y', + 'ext': 'mp4', + 'duration': 1380.241, + 'timestamp': 1459632600, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + 'series': 'World Trigger', + 'series_id': 'GR757DMKY', + 'season': 'World Trigger', + 'season_id': 'GR9P39NJ6', + 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + }, { + # Premium only + 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', + 'info_dict': { + 'id': 'GYE5WKQGR', + 'ext': 'mp4', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, + }, { + 
'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'only_matching': True, + }, { + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', + 'only_matching': True, + }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) + + elif object_type == 'movie': + result = self._transform_movie_response(response) + + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', + CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) + + else: + raise ExtractorError(f'Unknown object type {object_type}') + + # There might be multiple audio languages for one object (`<object>_metadata.versions`), + # so we need to get the id from `streams_link` instead or we dont know which language to choose + streams_link = response.get('streams_link') + if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + # We need go from unsigned to signed api to avoid getting soft banned + stream_response = self._call_cms_api_signed(remove_start( + streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + result['subtitles'] = self._extract_subtitles(stream_response) + # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - display_id, fatal=False, errnote=False) + intro_chapter = self._download_json( + f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) if isinstance(intro_chapter, dict): - chapters = [{ + result['chapters'] = [{ 'title': 'Intro', 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')) + 'end_time': float_or_none(intro_chapter.get('endTime')), }] + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', 
{calculate_count}), + }))) + + return result + + @staticmethod + def _transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') - }, - 'chapters': chapters + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('%s%s' % ( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), + } + + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
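The title assembly in `_transform_episode_response` above reduces to the pattern below (`format_field` renders an empty string for missing values, which the `or ''` fallbacks stand in for; `episode_title` is an illustrative name). The result matches test expectations such as 'World Trigger Episode 73 – To the Future':

    def episode_title(season_title, episode, title):
        left = '%s%s' % (season_title or '', f' Episode {episode}' if episode else '')
        return ' \u2013 '.join((left, title or ''))

    assert episode_title('World Trigger', 73, 'To the Future') == 'World Trigger Episode 73 \u2013 To the Future'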
- series/(?P<id>\w+) - (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' + series/(?P<id>\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -279,41 +459,163 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) - - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) - - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number'), - 'language': episode.get('audio_locale'), - } + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) - return self.playlist_result(entries(), internal_id, series_response.get('title')) + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }) + }))) + + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) + 
https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + watch/(?P<type>concert|musicvideo)/(?P<id>\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + }[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + streams_link = response.get('streams_link') + if not streams_link and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + result = self._transform_music_response(response) + stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artist': ('artist', 'name'), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } + + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ artist/(?P<id>\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genre': ['J-Pop', 'Anime', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) + + def entries(): + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, (attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + }), + } From edbe5b589dd0860a67b4e03f58db3cd2539d91c2 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Thu, 25 May 2023 22:52:44 +0200 Subject: [PATCH 105/501] Bugfixes for 4823ec9f461512daa1b8ab362893bb86a6320b26 Hotfix for fragmented downloads Authored by: bashonly --- yt_dlp/downloader/fragment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 6770815abb..53b4b604e7 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -284,7 +284,7 @@ def frag_progress_hook(s): frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) if not ctx['live']: state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes @@ -304,7 +304,7 @@ def _finish_frag_download(self, ctx, info_dict): to_file = ctx['tmpfilename'] != '-' if to_file: - downloaded_bytes = self.filesize_or_none(ctx['filename']) + downloaded_bytes = self.filesize_or_none(ctx['tmpfilename']) else: downloaded_bytes = ctx['complete_frags_downloaded_bytes'] From 4ad58667c102bd82a7c4cca8aa395ec1682e3b4c Mon Sep 17 00:00:00 2001 From: MMM <flashdagger@googlemail.com> Date: Thu, 25 May 2023 23:06:58 +0200 Subject: [PATCH 106/501] [extractor/bibeltv] Fix extraction, support live streams and series (#6505) Authored by: flashdagger --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/bibeltv.py | 208 +++++++++++++++++++++++++++++--- 2 files changed, 194 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8984d4b167..6a1406dc5c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -204,7 +204,11 @@ BFMTVLiveIE, BFMTVArticleIE, ) -from .bibeltv import BibelTVIE 
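The first `fragment.py` hunk above guards against downloaders that never set `frag_resume_len`; a minimal model of the corrected speed computation (`frag_speed` is an illustrative name, not the real `calc_speed` signature):

    def frag_speed(started, now, frag_downloaded_bytes, ctx):
        resumed = ctx.get('frag_resume_len', 0)  # a missing key now counts as 0
        return (frag_downloaded_bytes - resumed) / max(now - started, 1e-6)

    assert frag_speed(0.0, 2.0, 1000, {}) == 500.0
    assert frag_speed(0.0, 2.0, 1000, {'frag_resume_len': 600}) == 200.0

The second hunk is the same class of fix: the finished size is read from `tmpfilename` because the fragment data is still in the temporary file at the point the check runs, not yet in the final `filename`.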
+from .bibeltv import ( + BibelTVLiveIE, + BibelTVSeriesIE, + BibelTVVideoIE, +) from .bigflix import BigflixIE from .bigo import BigoIE from .bild import BildIE diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index fd20aadad4..34464daa1a 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,27 +1,197 @@ +from functools import partial + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', - 'info_dict': { - 'id': 'ref:329703', - 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), } - }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', - 'only_matching': True, + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single video' + _VALID_URL = 
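`_extract_formats_and_subtitles` above dispatches purely on the media URL's extension; a simplified standalone classifier showing the same decision (the real code uses `determine_ext` and calls the `InfoExtractor` m3u8/MPD helpers, merging their subtitles):

    def classify_media_url(media_url):
        ext = media_url.rpartition('.')[2].lower()
        if ext == 'm3u8':
            return 'hls'          # _extract_m3u8_formats_and_subtitles
        if ext == 'mpd':
            return 'dash'         # _extract_mpd_formats_and_subtitles
        if ext == 'mp4':
            return 'progressive'  # plain {'url': ...} format
        return 'unknown'          # triggers report_warning in the extractor

    assert classify_media_url('https://example.com/master.m3u8') == 'hls'
    assert classify_media_url('https://example.com/manifest.mpd') == 'dash'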
r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:video' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege', + 'md5': 'ec1c07efe54353780512e8a4103b612e', + 'info_dict': { + 'id': '344436', + 'ext': 'mp4', + 'title': 'Alte Wege', + 'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9', + 'timestamp': 1677877071, + 'duration': 150.0, + 'upload_date': '20230303', + 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg', + 'episode': 'Episode 1', + 'episode_number': 1, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'format': '6', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): crn_id = self._match_id(url) - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') + video_data = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id), + ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict})) + if not video_data: + raise ExtractorError('Missing video data.') + + return self._extract_video_info(video_data) + + +class BibelTVSeriesIE(BibelTVBaseIE): + IE_DESC = 'BibelTV series playlist' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:series' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag', + 'playlist_mincount': 400, + 'info_dict': { + 'id': '333485', + 'title': 'Ein Wunder für jeden Tag', + 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + webpage = self._download_webpage(url, crn_id) + nextjs_data = self._search_nextjs_data(webpage, crn_id) + series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict})) + if not series_data: + raise ExtractorError('Missing series data.') + + return self.playlist_result( + traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})), + crn_id, series_data.get('title'), clean_html(series_data.get('description'))) + + +class BibelTVLiveIE(BibelTVBaseIE): + IE_DESC = 'BibelTV live program' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P<id>[\w-]+)' + IE_NAME = 'bibeltv:live' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/livestreams/bibeltv/', + 'info_dict': { + 'id': 'bibeltv', + 'ext': 'mp4', + 'title': 're:Bibel TV', + 'live_status': 'is_live', + 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bibeltv.de/livestreams/impuls/', + 'only_matching': True, + }] + + def _real_extract(self, url): + stream_id = self._match_id(url) + webpage = self._download_webpage(url, stream_id) + stream_data = self._search_json( + r'\\"video\\":', webpage, 'bibeltvData', stream_id, + transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"'))) + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True) + + return { + 'id': stream_id, + 'title': stream_data.get('title'), + 'thumbnail': stream_data.get('poster'), + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + } From 5caf30dbc34f10b0be60676fece635b5c59f0d72 Mon Sep 17 00:00:00 2001 From: Audrey <45548254+tntmod54321@users.noreply.github.com> Date: Fri, 26 May 2023 08:24:39 -0400 Subject: [PATCH 107/501] [extractor/youtube] Extract `heatmap` 
data (#7100) Closes #3888 Authored by: tntmod54321 --- yt_dlp/extractor/common.py | 4 ++++ yt_dlp/extractor/youtube.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 78288f8091..1b1dd560fd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -350,6 +350,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 654bf5e6b6..80edcd77da 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1273,6 +1273,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', } }, { @@ -1426,6 +1427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'FlyingKitty', 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', 'uploader_id': '@FlyingKitty900', + 'comment_count': int, }, }, { @@ -3244,6 +3246,17 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) + def _extract_heatmap_from_player_overlay(self, data): + content_list = traverse_obj(data, ( + 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', + 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) + return next(filter(None, ( + traverse_obj(contents, (..., 'heatMarkerRenderer', { + 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, + 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), + })) for contents in content_list)), None) + def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: @@ -4313,6 +4326,8 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) + info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), expected_type=list, default=[]) From b844a3f8b16500663e7ab6c6ec061cc9b30f71ac Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 26 May 2023 07:57:10 -0500 Subject: [PATCH 108/501] [extractor/weverse] Add extractors (#6711) Closes #4786 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 8 + yt_dlp/extractor/naver.py | 2 +- yt_dlp/extractor/weverse.py | 604 ++++++++++++++++++++++++++++++++ 3 files changed, 613 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/weverse.py diff --git a/yt_dlp/extractor/_extractors.py 
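Each `heatMarkerRenderer` in the heatmap extraction above carries a start offset and duration in milliseconds plus an intensity already normalized to [0, 1]; a standalone restatement of the per-marker transform (`heat_marker_to_entry` is an illustrative name):

    def heat_marker_to_entry(marker):
        start_ms = marker['timeRangeStartMillis']
        return {
            'start_time': start_ms / 1000,
            'end_time': (start_ms + marker['markerDurationMillis']) / 1000,
            'value': marker['heatMarkerIntensityScoreNormalized'],
        }

    assert heat_marker_to_entry({
        'timeRangeStartMillis': 0,
        'markerDurationMillis': 5000,
        'heatMarkerIntensityScoreNormalized': 0.5,
    }) == {'start_time': 0.0, 'end_time': 5.0, 'value': 0.5}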
b/yt_dlp/extractor/_extractors.py index 6a1406dc5c..49dd9aecd5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2320,6 +2320,14 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .weverse import ( + WeverseIE, + WeverseMediaIE, + WeverseMomentIE, + WeverseLiveTabIE, + WeverseMediaTabIE, + WeverseLiveIE, +) from .wevidi import WeVidiIE from .whyp import WhypIE from .wikimedia import WikimediaIE diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 7a1890a618..d79caf5f3d 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE + @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py new file mode 100644 index 0000000000..ab629c885c --- /dev/null +++ b/yt_dlp/extractor/weverse.py @@ -0,0 +1,604 @@ +import base64 +import hashlib +import hmac +import itertools +import json +import re +import time +import urllib.error +import urllib.parse +import uuid + +from .common import InfoExtractor +from .naver import NaverBaseIE +from .youtube import YoutubeIE +from ..utils import ( + ExtractorError, + UserNotLive, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, + url_or_none, +) + + +class WeverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'weverse' + _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _API_HEADERS = { + 'Referer': 'https://weverse.io/', + 'WEV-device-Id': str(uuid.uuid4()), + } + + def _perform_login(self, username, password): + headers = { + 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', + 'x-acc-app-version': '2.2.6', + 'x-acc-language': 'en', + 'x-acc-service-id': 'weverse', + 'x-acc-trace-id': str(uuid.uuid4()), + 'x-clog-user-device-id': str(uuid.uuid4()), + } + check_username = self._download_json( + f'{self._ACCOUNT_API_BASE}/signup/email/status', None, + note='Checking username', query={'email': username}, headers=headers) + if not check_username.get('hasPassword'): + raise ExtractorError('Invalid username provided', expected=True) + + headers['content-type'] = 'application/json' + try: + auth = self._download_json( + f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + 'email': username, + 'password': password, + }, separators=(',', ':')).encode(), headers=headers, note='Logging in') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise ExtractorError('Invalid password provided', expected=True) + raise + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {auth["accessToken"]}' + + def _real_initialize(self): + if self._API_HEADERS.get('Authorization'): + return + + token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value) + if not token: + self.raise_login_required() + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' + + def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): + # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js + # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: + key = 
b'1b9cb6378d959b45714bec49971ade22e6e24e42' + api_path = update_url_query(ep, { + 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', + 'language': 'en', + 'platform': 'WEB', + 'wpf': 'pc', + }) + wmsgpad = int(time.time() * 1000) + wmd = base64.b64encode(hmac.HMAC( + key, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + headers = {'Content-Type': 'application/json'} if data else {} + try: + return self._download_json( + f'https://global.apis.naver.com/weverse/wevweb{api_path}', video_id, note=note, + data=data, headers={**self._API_HEADERS, **headers}, query={ + 'wmsgpad': wmsgpad, + 'wmd': wmd, + }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + self.raise_login_required( + 'Session token has expired. Log in again or refresh cookies in browser') + elif isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + raise ExtractorError('Your account does not have access to this content', expected=True) + raise + + def _call_post_api(self, video_id): + return self._call_api(f'/post/v1.0/post-{video_id}?fieldSet=postV1', video_id) + + def _get_community_id(self, channel): + return str(self._call_api( + f'/community/v1.0/communityIdUrlPathByUrlPathArtistCode?keyword={channel}', + channel, note='Fetching community ID')['communityId']) + + def _get_formats(self, data, video_id): + formats = traverse_obj(data, ('videos', 'list', lambda _, v: url_or_none(v['source']), { + 'url': 'source', + 'width': ('encodingOption', 'width', {int_or_none}), + 'height': ('encodingOption', 'height', {int_or_none}), + 'vcodec': 'type', + 'vbr': ('bitrate', 'video', {int_or_none}), + 'abr': ('bitrate', 'audio', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('encodingOption', 'id', {str_or_none}), + })) + + for stream in traverse_obj(data, ('streams', lambda _, v: v['type'] == 'HLS' and url_or_none(v['source']))): + query = {} + for param in traverse_obj(stream, ('keys', lambda _, v: v['type'] == 'param' and v['name'])): + query[param['name']] = param.get('value', '') + fmts = self._extract_m3u8_formats( + stream['source'], video_id, 'mp4', m3u8_id='hls', fatal=False, query=query) + if query: + for fmt in fmts: + fmt['url'] = update_url_query(fmt['url'], query) + fmt['extra_param_to_segment_url'] = urllib.parse.urlencode(query) + formats.extend(fmts) + + return formats + + def _get_subs(self, caption_url): + subs_ext_re = r'\.(?:ttml|vtt)' + replace_ext = lambda x, y: re.sub(subs_ext_re, y, x) + if re.search(subs_ext_re, caption_url): + return [replace_ext(caption_url, '.ttml'), replace_ext(caption_url, '.vtt')] + return [caption_url] + + def _parse_post_meta(self, metadata): + return traverse_obj(metadata, { + 'title': ((('extension', 'mediaInfo', 'title'), 'title'), {str}), + 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'uploader_id': ('author', 'memberId', {str}), + 'creator': ('community', 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'video', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'release_timestamp': ('extension', 'video', 'onAirStartAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', (('mediaInfo', 'thumbnail', 'url'), ('video', 'thumb')), {url_or_none}), + 'view_count': ('extension', 'video', 'playCount', {int_or_none}), + 'like_count': ('extension', 'video', 
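The request signature in `_call_api` above can be recomputed standalone: HMAC-SHA1 over at most the first 255 characters of the query-bearing path concatenated with a millisecond timestamp, then base64-encoded (the key and path below are illustrative placeholders, not real credentials):

    import base64
    import hashlib
    import hmac
    import time

    def sign_request(key, api_path, wmsgpad=None):
        wmsgpad = int(time.time() * 1000) if wmsgpad is None else wmsgpad
        digest = hmac.new(key, f'{api_path[:255]}{wmsgpad}'.encode(), hashlib.sha1).digest()
        return wmsgpad, base64.b64encode(digest).decode()

    wmsgpad, wmd = sign_request(b'example-key', '/post/v1.0/post-123?fieldSet=postV1')

Both values then travel together as the `wmsgpad` and `wmd` query parameters, as in the `_download_json` call above.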
'likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False) + + def _extract_availability(self, data): + return self._availability(**traverse_obj(data, ((('extension', 'video'), None), { + 'needs_premium': 'paid', + 'needs_subscription': 'membershipOnly', + }), get_all=False, expected_type=bool), needs_auth=True) + + def _extract_live_status(self, data): + data = traverse_obj(data, ('extension', 'video', {dict})) or {} + if data.get('type') == 'LIVE': + return traverse_obj({ + 'ONAIR': 'is_live', + 'DONE': 'post_live', + 'STANDBY': 'is_upcoming', + 'DELAY': 'is_upcoming', + }, (data.get('status'), {str})) or 'is_live' + return 'was_live' if data.get('liveToVod') else 'not_live' + + +class WeverseIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/0-107323480', + 'md5': '1fa849f00181eef9100d3c8254c47979', + 'info_dict': { + 'id': '0-107323480', + 'ext': 'mp4', + 'title': '행복한 평이루💜', + 'description': '', + 'uploader': 'Billlie', + 'uploader_id': '5ae14aed7b7cdc65fa87c41fe06cc936', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1666262062, + 'upload_date': '20221020', + 'release_timestamp': 1666262058, + 'release_date': '20221020', + 'duration': 3102, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://weverse.io/lesserafim/live/2-102331763', + 'md5': 'e46125c08b13a6c8c1f4565035cca987', + 'info_dict': { + 'id': '2-102331763', + 'ext': 'mp4', + 'title': '🎂김채원 생신🎂', + 'description': '🎂김채원 생신🎂', + 'uploader': 'LE SSERAFIM ', + 'uploader_id': 'd26ddc1e258488a0a2b795218d14d59d', + 'channel': 'lesserafim', + 'channel_id': '47', + 'channel_url': 'https://weverse.io/lesserafim', + 'creator': 'LE SSERAFIM', + 'timestamp': 1659353400, + 'upload_date': '20220801', + 'release_timestamp': 1659353400, + 'release_date': '20220801', + 'duration': 3006, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + 'subtitles': { + 'id_ID': 'count:2', + 'en_US': 'count:2', + 'es_ES': 'count:2', + 'vi_VN': 'count:2', + 'th_TH': 'count:2', + 'zh_CN': 'count:2', + 'zh_TW': 'count:2', + 'ja_JP': 'count:2', + 'ko_KR': 'count:2', + }, + }, + }, { + 'url': 'https://weverse.io/treasure/live/2-117230416', + 'info_dict': { + 'id': '2-117230416', + 'ext': 'mp4', + 'title': r're:스껄도려님 첫 스무살 생파🦋', + 'description': '', + 'uploader': 'TREASURE', + 'uploader_id': '77eabbc449ca37f7970054a136f60082', + 'channel': 'treasure', + 'channel_id': '20', + 'channel_url': 'https://weverse.io/treasure', + 'creator': 'TREASURE', + 'timestamp': 1680667651, + 'upload_date': '20230405', + 'release_timestamp': 1680667639, + 'release_date': '20230405', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['video']['videoId'] + availability = self._extract_availability(post) + live_status = self._extract_live_status(post) + 
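`_extract_live_status` above maps Weverse's own states onto yt-dlp `live_status` values; restated without `traverse_obj` (`live_status` here is an illustrative helper):

    def live_status(video_ext):
        if video_ext.get('type') == 'LIVE':
            return {'ONAIR': 'is_live', 'DONE': 'post_live',
                    'STANDBY': 'is_upcoming', 'DELAY': 'is_upcoming'}.get(video_ext.get('status'), 'is_live')
        # a finished live only counts as 'was_live' if a VOD was produced
        return 'was_live' if video_ext.get('liveToVod') else 'not_live'

    assert live_status({'type': 'LIVE', 'status': 'DONE'}) == 'post_live'
    assert live_status({'liveToVod': True}) == 'was_live'

The branches that follow act on exactly these values: 'is_upcoming' raises no formats, 'is_live' goes through the live playInfo endpoint, and 'post_live' without downloadable formats is reported as an ended stream with no VOD.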
video_info, formats = {}, [] + + if live_status == 'is_upcoming': + self.raise_no_formats('Livestream has not yet started', expected=True) + + elif live_status == 'is_live': + video_info = self._call_api( + f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + video_id, note='Downloading live JSON') + playback = self._parse_json(video_info['lipPlayback'], video_id) + m3u8_url = traverse_obj(playback, ( + 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) + + elif live_status == 'post_live': + if availability in ('premium_only', 'subscriber_only'): + self.report_drm(video_id) + self.raise_no_formats( + 'Livestream has ended and downloadable VOD is not available', expected=True) + + else: + infra_video_id = post['extension']['video']['infraVideoId'] + in_key = self._call_api( + f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + data=b'{}', note='Downloading VOD API key')['inKey'] + + video_info = self._download_json( + f'https://global.apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{infra_video_id}', + video_id, note='Downloading VOD JSON', query={ + 'key': in_key, + 'sid': traverse_obj(post, ('extension', 'video', 'serviceId')) or '2070', + 'pid': str(uuid.uuid4()), + 'nonce': int(time.time() * 1000), + 'devt': 'html5_pc', + 'prv': 'Y' if post.get('membershipOnly') else 'N', + 'aup': 'N', + 'stpb': 'N', + 'cpl': 'en', + 'env': 'prod', + 'lc': 'en', + 'adi': '[{"adSystem":"null"}]', + 'adu': '/', + }) + + formats = self._get_formats(video_info, video_id) + has_drm = traverse_obj(video_info, ('meta', 'provider', 'name', {str.lower})) == 'drm' + if has_drm and formats: + self.report_warning( + 'Requested content is DRM-protected, only a 30-second preview is available', video_id) + elif has_drm and not formats: + self.report_drm(video_id) + + return { + 'id': video_id, + 'channel': channel, + 'channel_url': f'https://weverse.io/{channel}', + 'formats': formats, + 'availability': availability, + 'live_status': live_status, + **self._parse_post_meta(post), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseMediaIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/4-116372884', + 'md5': '8efc9cfd61b2f25209eb1a5326314d28', + 'info_dict': { + 'id': 'e-C9wLSQs6o', + 'ext': 'mp4', + 'title': 'Billlie | \'EUNOIA\' Performance Video (heartbeat ver.)', + 'description': 'md5:6181caaf2a2397bca913ffe368c104e5', + 'channel': 'Billlie', + 'channel_id': 'UCyc9sUCxELTDK9vELO5Fzeg', + 'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', + 'uploader': 'Billlie', + 'uploader_id': '@Billlie', + 'uploader_url': 'http://www.youtube.com/@Billlie', + 'upload_date': '20230403', + 'duration': 211, + 'age_limit': 0, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'availability': 'public', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': 'count:7', + }, + }, { + 'url': 'https://weverse.io/billlie/media/3-102914520', + 'md5': '031551fcbd716bc4f080cb6174a43d8a', + 'info_dict': { + 'id': '3-102914520', + 'ext': 'mp4', + 'title': 'From. 
SUHYEON🌸', + 'description': 'Billlie 멤버별 독점 영상 공개💙💜', + 'uploader': 'Billlie_official', + 'uploader_id': 'f569c6e92f7eaffef0a395037dcaa54f', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1662174000, + 'upload_date': '20220903', + 'release_timestamp': 1662174000, + 'release_date': '20220903', + 'duration': 17.0, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'not_live', + }, + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + media_type = traverse_obj(post, ('extension', 'mediaInfo', 'mediaType', {str.lower})) + youtube_id = traverse_obj(post, ('extension', 'youtube', 'youtubeVideoId', {str})) + + if media_type == 'vod': + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) + elif media_type == 'youtube' and youtube_id: + return self.url_result(youtube_id, YoutubeIE) + elif media_type == 'image': + self.raise_no_formats('No video content found in webpage', expected=True) + elif media_type: + raise ExtractorError(f'Unsupported media type "{media_type}"') + + self.raise_no_formats('No video content found in webpage') + + +class WeverseMomentIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444', + 'md5': '87733ac19a54081b7dfc2442036d282b', + 'info_dict': { + 'id': '1-117229444', + 'ext': 'mp4', + 'title': '今日もめっちゃいい天気☀️🌤️', + 'uploader': '레아', + 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', + 'channel': 'secretnumber', + 'channel_id': '56', + 'creator': 'SECRET NUMBER', + 'duration': 10, + 'upload_date': '20230405', + 'timestamp': 1680653968, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + }, + 'skip': 'Moment has expired', + }] + + def _real_extract(self, url): + channel, uploader_id, video_id = self._match_valid_url(url).group('artist', 'uid', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['moment']['video']['videoId'] + video_info = self._call_api( + f'/cvideo/v1.0/cvideo-{api_video_id}/playInfo?videoId={api_video_id}', video_id, + note='Downloading moment JSON')['playInfo'] + + return { + 'id': video_id, + 'channel': channel, + 'uploader_id': uploader_id, + 'formats': self._get_formats(video_info, video_id), + 'availability': self._extract_availability(post), + **traverse_obj(post, { + 'title': ((('extension', 'moment', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'creator': (('community', 'author'), 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'moment', 'video', 'uploadInfo', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', 'moment', 'video', 'uploadInfo', 'imageUrl', {url_or_none}), + 'like_count': ('emotionCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseTabBaseIE(WeverseBaseIE): + _ENDPOINT = None + _PATH = None + _QUERY = {} + _RESULT_IE = None + 
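+    # Subclasses fill in the API endpoint, URL path component, query parameters and
+    # result extractor; _entries() then pages through the tab by following the
+    # `after` cursor returned in paging.nextParams.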
+ def _entries(self, channel_id, channel, first_page): + query = self._QUERY.copy() + + for page in itertools.count(1): + posts = first_page if page == 1 else self._call_api( + update_url_query(self._ENDPOINT % channel_id, query), channel, + note=f'Downloading {self._PATH} tab page {page}') + + for post in traverse_obj(posts, ('data', lambda _, v: v['postId'])): + yield self.url_result( + f'https://weverse.io/{channel}/{self._PATH}/{post["postId"]}', + self._RESULT_IE, post['postId'], **self._parse_post_meta(post), + channel=channel, channel_url=f'https://weverse.io/{channel}', + availability=self._extract_availability(post), + live_status=self._extract_live_status(post)) + + query['after'] = traverse_obj(posts, ('paging', 'nextParams', 'after', {str})) + if not query['after']: + break + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + first_page = self._call_api( + update_url_query(self._ENDPOINT % channel_id, self._QUERY), channel, + note=f'Downloading {self._PATH} tab page 1') + + return self.playlist_result( + self._entries(channel_id, channel, first_page), f'{channel}-{self._PATH}', + **traverse_obj(first_page, ('data', ..., { + 'playlist_title': ('community', 'communityName', {str}), + 'thumbnail': ('author', 'profileImageUrl', {url_or_none}), + }), get_all=False)) + + +class WeverseLiveTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/', + 'playlist_mincount': 55, + 'info_dict': { + 'id': 'billlie-live', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }] + + _ENDPOINT = '/post/v1.0/community-%s/liveTabPosts' + _PATH = 'live' + _QUERY = {'fieldSet': 'postsV1'} + _RESULT_IE = WeverseIE + + +class WeverseMediaTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/', + 'playlist_mincount': 231, + 'info_dict': { + 'id': 'billlie-media', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }, { + 'url': 'https://weverse.io/lesserafim/media/all', + 'only_matching': True, + }, { + 'url': 'https://weverse.io/lesserafim/media/new', + 'only_matching': True, + }] + + _ENDPOINT = '/media/v1.0/community-%s/more' + _PATH = 'media' + _QUERY = {'fieldSet': 'postsV1', 'filterType': 'RECENT'} + _RESULT_IE = WeverseMediaIE + + +class WeverseLiveIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/purplekiss', + 'info_dict': { + 'id': '3-116560493', + 'ext': 'mp4', + 'title': r're:모하냥🫶🏻', + 'description': '내일은 금요일~><', + 'uploader': '채인', + 'uploader_id': '1ffb1d9d904d6b3db2783f876eb9229d', + 'channel': 'purplekiss', + 'channel_id': '35', + 'channel_url': 'https://weverse.io/purplekiss', + 'creator': 'PURPLE KISS', + 'timestamp': 1680780892, + 'upload_date': '20230406', + 'release_timestamp': 1680780883, + 'release_date': '20230406', + 'thumbnail': 'https://weverse-live.pstatic.net/v1.0/live/62044/thumb', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/billlie/', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + video_id = 
traverse_obj( + self._call_api(update_url_query(f'/post/v1.0/community-{channel_id}/liveTab', { + 'debugMessage': 'true', + 'fields': 'onAirLivePosts.fieldSet(postsV1).limit(10),reservedLivePosts.fieldSet(postsV1).limit(10)', + }), channel, note='Downloading live JSON'), ( + ('onAirLivePosts', 'reservedLivePosts'), 'data', + lambda _, v: self._extract_live_status(v) in ('is_live', 'is_upcoming'), 'postId', {str}), + get_all=False) + + if not video_id: + raise UserNotLive(video_id=channel) + + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) From 66468bbf49562ff82670cbbd456c5e8448a6df34 Mon Sep 17 00:00:00 2001 From: sqrtNOT <77981959+sqrtNOT@users.noreply.github.com> Date: Fri, 26 May 2023 13:03:19 +0000 Subject: [PATCH 109/501] [extractor/comedycentral] Add support for movies (#7108) Closes #1926 Authored by: sqrtNOT --- yt_dlp/extractor/comedycentral.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/comedycentral.py b/yt_dlp/extractor/comedycentral.py index 05fc9f2b50..27d295bb38 100644 --- a/yt_dlp/extractor/comedycentral.py +++ b/yt_dlp/extractor/comedycentral.py @@ -2,7 +2,7 @@ class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas', + 'only_matching': True, }] From 08916a49c777cb6e000eec092881eb93ec22076c Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 27 May 2023 19:06:13 +1200 Subject: [PATCH 110/501] [core] Improve HTTP redirect handling (#7094) Aligns HTTP redirect handling with what browsers commonly do and RFC standards. Fixes issues https://github.com/yt-dlp/yt-dlp/commit/afac4caa7db30804bebac33e53c3cb0237958224 missed. 
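The method-rewriting rules the new handler follows can be summarised roughly as
below (an illustrative sketch of the behaviour the new tests assert, not the
actual patch code; the helper name is made up):

    def redirected_method(code, method):
        # A 303 must use GET (or HEAD) for the subsequent request
        if code == 303 and method != 'HEAD':
            return 'GET'
        # Like browsers, turn a POST into a GET on 301/302; other methods
        # such as PUT keep both their method and their payload
        if code in (301, 302) and method == 'POST':
            return 'GET'
        # 307 and 308 never change the method
        return method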
Authored by: coletdjnz --- test/test_YoutubeDL.py | 6 - test/test_http.py | 288 +++++++++++++++++++++++++++++++++++++---- yt_dlp/utils/_utils.py | 59 +++------ 3 files changed, 281 insertions(+), 72 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 477fd220ef..ee6c527135 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -10,7 +10,6 @@ import copy import json -import urllib.error from test.helper import FakeYDL, assertRegexpMatches from yt_dlp import YoutubeDL @@ -1097,11 +1096,6 @@ def test_selection(params, expected_ids, evaluate_all=False): test_selection({'playlist_items': '-15::2'}, INDICES[1::2], True) test_selection({'playlist_items': '-15::15'}, [], True) - def test_urlopen_no_file_protocol(self): - # see https://github.com/ytdl-org/youtube-dl/issues/8227 - ydl = YDL() - self.assertRaises(urllib.error.URLError, ydl.urlopen, 'file:///etc/passwd') - def test_do_not_override_ie_key_in_url_transparent(self): ydl = YDL() diff --git a/test/test_http.py b/test/test_http.py index 5ca0d7a470..d684905da5 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -7,40 +7,163 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - +import gzip +import http.cookiejar import http.server +import io +import pathlib import ssl +import tempfile import threading +import urllib.error import urllib.request from test.helper import http_server_port from yt_dlp import YoutubeDL +from yt_dlp.utils import sanitized_Request, urlencode_postdata + +from .helper import FakeYDL TEST_DIR = os.path.dirname(os.path.abspath(__file__)) class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + def log_message(self, format, *args): pass + def _headers(self): + payload = str(self.headers).encode('utf-8') + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _redirect(self): + self.send_response(int(self.path[len('/redirect_'):])) + self.send_header('Location', '/method') + self.send_header('Content-Length', '0') + self.end_headers() + + def _method(self, method, payload=None): + self.send_response(200) + self.send_header('Content-Length', str(len(payload or ''))) + self.send_header('Method', method) + self.end_headers() + if payload: + self.wfile.write(payload) + + def _status(self, status): + payload = f'<html>{status} NOT FOUND</html>'.encode() + self.send_response(int(status)) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def _read_data(self): + if 'Content-Length' in self.headers: + return self.rfile.read(int(self.headers['Content-Length'])) + + def do_POST(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('POST', data) + elif self.path.startswith('/headers'): + self._headers() + else: + self._status(404) + + def do_HEAD(self): + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('HEAD') + else: + self._status(404) + + def do_PUT(self): + data = self._read_data() + if self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('PUT', data) + else: + self._status(404) + def do_GET(self): if self.path == '/video.html': + payload = 
b'<html><video src="/vid.mp4" /></html>' self.send_response(200) self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) # required for persistent connections self.end_headers() - self.wfile.write(b'<html><video src="/vid.mp4" /></html>') + self.wfile.write(payload) elif self.path == '/vid.mp4': + payload = b'\x00\x00\x00\x00\x20\x66\x74[video]' self.send_response(200) self.send_header('Content-Type', 'video/mp4') + self.send_header('Content-Length', str(len(payload))) self.end_headers() - self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]') + self.wfile.write(payload) elif self.path == '/%E4%B8%AD%E6%96%87.html': + payload = b'<html><video src="/vid.mp4" /></html>' self.send_response(200) self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + elif self.path == '/%c7%9f': + payload = b'<html><video src="/vid.mp4" /></html>' + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + elif self.path.startswith('/redirect_'): + self._redirect() + elif self.path.startswith('/method'): + self._method('GET') + elif self.path.startswith('/headers'): + self._headers() + elif self.path == '/trailing_garbage': + payload = b'<html><video src="/vid.mp4" /></html>' + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Encoding', 'gzip') + buf = io.BytesIO() + with gzip.GzipFile(fileobj=buf, mode='wb') as f: + f.write(payload) + compressed = buf.getvalue() + b'trailing garbage' + self.send_header('Content-Length', str(len(compressed))) + self.end_headers() + self.wfile.write(compressed) + elif self.path == '/302-non-ascii-redirect': + new_url = f'http://127.0.0.1:{http_server_port(self.server)}/中文.html' + self.send_response(301) + self.send_header('Location', new_url) + self.send_header('Content-Length', '0') self.end_headers() - self.wfile.write(b'<html><video src="/vid.mp4" /></html>') else: - assert False + self._status(404) + + def send_header(self, keyword, value): + """ + Forcibly allow HTTP server to send non percent-encoded non-ASCII characters in headers. + This is against what is defined in RFC 3986, however we need to test we support this + since some sites incorrectly do this. 
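+        (For example, the /302-non-ascii-redirect route above sends a Location
+        header containing raw UTF-8.)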
+ """ + if keyword.lower() == 'connection': + return super().send_header(keyword, value) + + if not hasattr(self, '_headers_buffer'): + self._headers_buffer = [] + + self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode()) class FakeLogger: @@ -56,36 +179,128 @@ def error(self, msg): class TestHTTP(unittest.TestCase): def setUp(self): - self.httpd = http.server.HTTPServer( + # HTTP server + self.http_httpd = http.server.ThreadingHTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) - self.port = http_server_port(self.httpd) - self.server_thread = threading.Thread(target=self.httpd.serve_forever) - self.server_thread.daemon = True - self.server_thread.start() + self.http_port = http_server_port(self.http_httpd) + self.http_server_thread = threading.Thread(target=self.http_httpd.serve_forever) + # FIXME: we should probably stop the http server thread after each test + # See: https://github.com/yt-dlp/yt-dlp/pull/7094#discussion_r1199746041 + self.http_server_thread.daemon = True + self.http_server_thread.start() - -class TestHTTPS(unittest.TestCase): - def setUp(self): + # HTTPS server certfn = os.path.join(TEST_DIR, 'testcert.pem') - self.httpd = http.server.HTTPServer( + self.https_httpd = http.server.ThreadingHTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.load_cert_chain(certfn, None) - self.httpd.socket = sslctx.wrap_socket(self.httpd.socket, server_side=True) - self.port = http_server_port(self.httpd) - self.server_thread = threading.Thread(target=self.httpd.serve_forever) - self.server_thread.daemon = True - self.server_thread.start() + self.https_httpd.socket = sslctx.wrap_socket(self.https_httpd.socket, server_side=True) + self.https_port = http_server_port(self.https_httpd) + self.https_server_thread = threading.Thread(target=self.https_httpd.serve_forever) + self.https_server_thread.daemon = True + self.https_server_thread.start() def test_nocheckcertificate(self): - ydl = YoutubeDL({'logger': FakeLogger()}) - self.assertRaises( - Exception, - ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port) + with FakeYDL({'logger': FakeLogger()}) as ydl: + with self.assertRaises(urllib.error.URLError): + ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers')) - ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) - r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) - self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) + with FakeYDL({'logger': FakeLogger(), 'nocheckcertificate': True}) as ydl: + r = ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers')) + self.assertEqual(r.status, 200) + r.close() + + def test_percent_encode(self): + with FakeYDL() as ydl: + # Unicode characters should be encoded with uppercase percent-encoding + res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/中文.html')) + self.assertEqual(res.status, 200) + res.close() + # don't normalize existing percent encodings + res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/%c7%9f')) + self.assertEqual(res.status, 200) + res.close() + + def test_unicode_path_redirection(self): + with FakeYDL() as ydl: + r = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) + self.assertEqual(r.url, f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html') + r.close() + + def test_redirect(self): + with FakeYDL() as ydl: + def do_req(redirect_status, method): + data = b'testdata' 
if method in ('POST', 'PUT') else None + res = ydl.urlopen(sanitized_Request( + f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data)) + return res.read().decode('utf-8'), res.headers.get('method', '') + + # A 303 must either use GET or HEAD for subsequent request + self.assertEqual(do_req(303, 'POST'), ('', 'GET')) + self.assertEqual(do_req(303, 'HEAD'), ('', 'HEAD')) + + self.assertEqual(do_req(303, 'PUT'), ('', 'GET')) + + # 301 and 302 turn POST only into a GET + self.assertEqual(do_req(301, 'POST'), ('', 'GET')) + self.assertEqual(do_req(301, 'HEAD'), ('', 'HEAD')) + self.assertEqual(do_req(302, 'POST'), ('', 'GET')) + self.assertEqual(do_req(302, 'HEAD'), ('', 'HEAD')) + + self.assertEqual(do_req(301, 'PUT'), ('testdata', 'PUT')) + self.assertEqual(do_req(302, 'PUT'), ('testdata', 'PUT')) + + # 307 and 308 should not change method + for m in ('POST', 'PUT'): + self.assertEqual(do_req(307, m), ('testdata', m)) + self.assertEqual(do_req(308, m), ('testdata', m)) + + self.assertEqual(do_req(307, 'HEAD'), ('', 'HEAD')) + self.assertEqual(do_req(308, 'HEAD'), ('', 'HEAD')) + + # These should not redirect and instead raise an HTTPError + for code in (300, 304, 305, 306): + with self.assertRaises(urllib.error.HTTPError): + do_req(code, 'GET') + + def test_content_type(self): + # https://github.com/yt-dlp/yt-dlp/commit/379a4f161d4ad3e40932dcf5aca6e6fb9715ab28 + with FakeYDL({'nocheckcertificate': True}) as ydl: + # method should be auto-detected as POST + r = sanitized_Request(f'https://localhost:{self.https_port}/headers', data=urlencode_postdata({'test': 'test'})) + + headers = ydl.urlopen(r).read().decode('utf-8') + self.assertIn('Content-Type: application/x-www-form-urlencoded', headers) + + # test http + r = sanitized_Request(f'http://localhost:{self.http_port}/headers', data=urlencode_postdata({'test': 'test'})) + headers = ydl.urlopen(r).read().decode('utf-8') + self.assertIn('Content-Type: application/x-www-form-urlencoded', headers) + + def test_cookiejar(self): + with FakeYDL() as ydl: + ydl.cookiejar.set_cookie(http.cookiejar.Cookie( + 0, 'test', 'ytdlp', None, False, '127.0.0.1', True, + False, '/headers', True, False, None, False, None, None, {})) + data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers')).read() + self.assertIn(b'Cookie: test=ytdlp', data) + + def test_no_compression_compat_header(self): + with FakeYDL() as ydl: + data = ydl.urlopen( + sanitized_Request( + f'http://127.0.0.1:{self.http_port}/headers', + headers={'Youtubedl-no-compression': True})).read() + self.assertIn(b'Accept-Encoding: identity', data) + self.assertNotIn(b'youtubedl-no-compression', data.lower()) + + def test_gzip_trailing_garbage(self): + # https://github.com/ytdl-org/youtube-dl/commit/aa3e950764337ef9800c936f4de89b31c00dfcf5 + # https://github.com/ytdl-org/youtube-dl/commit/6f2ec15cee79d35dba065677cad9da7491ec6e6f + with FakeYDL() as ydl: + data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8') + self.assertEqual(data, '<html><video src="/vid.mp4" /></html>') class TestClientCert(unittest.TestCase): @@ -112,8 +327,8 @@ def _run_test(self, **params): 'nocheckcertificate': True, **params, }) - r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) - self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) + r = ydl.extract_info(f'https://127.0.0.1:{self.port}/video.html') + self.assertEqual(r['url'], f'https://127.0.0.1:{self.port}/vid.mp4') 
 def test_certificate_combined_nopass(self):
         self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt'))
 
@@ -188,5 +403,22 @@ def test_proxy_with_idn(self):
         self.assertEqual(response, 'normal: http://xn--fiq228c.tw/')
 
 
+class TestFileURL(unittest.TestCase):
+    # See https://github.com/ytdl-org/youtube-dl/issues/8227
+    def test_file_urls(self):
+        tf = tempfile.NamedTemporaryFile(delete=False)
+        tf.write(b'foobar')
+        tf.close()
+        url = pathlib.Path(tf.name).as_uri()
+        with FakeYDL() as ydl:
+            self.assertRaisesRegex(
+                urllib.error.URLError, 'file:// URLs are explicitly disabled in yt-dlp for security reasons', ydl.urlopen, url)
+        with FakeYDL({'enable_file_urls': True}) as ydl:
+            res = ydl.urlopen(url)
+            self.assertEqual(res.read(), b'foobar')
+            res.close()
+        os.unlink(tf.name)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 238b0fe694..d78022295b 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1664,61 +1664,44 @@ class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
 
     The code is based on HTTPRedirectHandler implementation from CPython [1].
 
-    This redirect handler solves two issues:
-    - ensures redirect URL is always unicode under python 2
-    - introduces support for experimental HTTP response status code
-      308 Permanent Redirect [2] used by some sites [3]
+    This redirect handler fixes and improves the logic to better align with RFC 7231
+    and what browsers tend to do [2][3]
 
     1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
-    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
-    3. https://github.com/ytdl-org/youtube-dl/issues/28768
+    2. https://datatracker.ietf.org/doc/html/rfc7231
+    3. https://github.com/python/cpython/issues/91306
     """
 
     http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
 
     def redirect_request(self, req, fp, code, msg, headers, newurl):
-        """Return a Request or None in response to a redirect.
-
-        This is called by the http_error_30x methods when a
-        redirection response is received. If a redirection should
-        take place, return a new Request to allow http_error_30x to
-        perform the redirect. Otherwise, raise HTTPError if no-one
-        else should try to handle this url. Return None if you can't
-        but another Handler might.
-        """
-        m = req.get_method()
-        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
-                 or code in (301, 302, 303) and m == "POST")):
+        if code not in (301, 302, 303, 307, 308):
             raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
-
-        # Strictly (according to RFC 2616), 301 or 302 in response to
-        # a POST MUST NOT cause a redirection without confirmation
-        # from the user (of urllib.request, in this case). In practice,
-        # essentially all clients do redirect in this case, so we do
-        # the same.
-
-        # Be conciliant with URIs containing a space. This is mainly
-        # redundant with the more complete encoding done in http_error_302(),
-        # but it is kept for compatibility with other callers.
-        newurl = newurl.replace(' ', '%20')
-
-        CONTENT_HEADERS = ("content-length", "content-type")
-        # NB: don't use dict comprehension for python 2.6 compatibility
-        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
+        new_method = req.get_method()
+        new_data = req.data
+        remove_headers = []
 
         # A 303 must either use GET or HEAD for subsequent request
         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
-        if code == 303 and m != 'HEAD':
-            m = 'GET'
+        if code == 303 and req.get_method() != 'HEAD':
+            new_method = 'GET'
         # 301 and 302 redirects are commonly turned into a GET from a POST
         # for subsequent requests by browsers, so we'll do the same.
         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
-        if code in (301, 302) and m == 'POST':
-            m = 'GET'
+        elif code in (301, 302) and req.get_method() == 'POST':
+            new_method = 'GET'
+
+        # only remove payload if method changed (e.g. POST to GET)
+        if new_method != req.get_method():
+            new_data = None
+            remove_headers.extend(['Content-Length', 'Content-Type'])
+
+        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
 
         return urllib.request.Request(
-            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
-            unverifiable=True, method=m)
+            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+            unverifiable=True, method=new_method, data=new_data)
 
 
 def extract_timezone(date_str):

From b87e01c123fd560b6a674ce00f45a9459d82d98a Mon Sep 17 00:00:00 2001
From: coletdjnz <coletdjnz@protonmail.com>
Date: Sat, 27 May 2023 19:08:19 +1200
Subject: [PATCH 111/501] [cookies] Move `YoutubeDLCookieJar` to cookies module
 (#7091)

Authored by: coletdjnz
---
 test/test_YoutubeDLCookieJar.py |   8 +-
 yt_dlp/YoutubeDL.py             |   7 +-
 yt_dlp/cookies.py               | 144 +++++++++++++++++++++++++++++++-
 yt_dlp/extractor/common.py      |   2 +-
 yt_dlp/utils/_legacy.py         |   3 +
 yt_dlp/utils/_utils.py          | 130 ----------------------------
 6 files changed, 157 insertions(+), 137 deletions(-)

diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py
index 0d4e7dc97c..2c73d7d853 100644
--- a/test/test_YoutubeDLCookieJar.py
+++ b/test/test_YoutubeDLCookieJar.py
@@ -11,7 +11,7 @@
 import re
 import tempfile
 
-from yt_dlp.utils import YoutubeDLCookieJar
+from yt_dlp.cookies import YoutubeDLCookieJar
 
 
 class TestYoutubeDLCookieJar(unittest.TestCase):
@@ -47,6 +47,12 @@ def test_malformed_cookies(self):
         # will be ignored
         self.assertFalse(cookiejar._cookies)
 
+    def test_get_cookie_header(self):
+        cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
+        cookiejar.load(ignore_discard=True, ignore_expires=True)
+        header = cookiejar.get_cookie_header('https://www.foobar.foobar')
+        self.assertIn('HTTPONLY_COOKIE', header)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index e1e5588363..f69bc98c55 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -2404,7 +2404,7 @@ def _calc_headers(self, info_dict):
         if 'Youtubedl-No-Compression' in res:  # deprecated
             res.pop('Youtubedl-No-Compression', None)
             res['Accept-Encoding'] = 'identity'
-        cookies = self._calc_cookies(info_dict['url'])
+        cookies = self.cookiejar.get_cookie_header(info_dict['url'])
         if cookies:
             res['Cookie'] = cookies
 
@@ -2416,9 +2416,8 @@ def _calc_headers(self, info_dict):
         return res
 
     def _calc_cookies(self, url):
-        pr = sanitized_Request(url)
-        self.cookiejar.add_cookie_header(pr)
-        return
pr.get_header('Cookie') + self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version') + return self.cookiejar.get_cookie_header(url) def _sort_thumbnails(self, thumbnails): thumbnails.sort(key=lambda t: ( diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 4cafb522e2..eb6a2656be 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,7 +1,9 @@ import base64 +import collections import contextlib import http.cookiejar import http.cookies +import io import json import os import re @@ -11,6 +13,7 @@ import sys import tempfile import time +import urllib.request from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -29,11 +32,14 @@ from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( Popen, - YoutubeDLCookieJar, error_to_str, + escape_url, expand_path, is_path_like, + sanitize_url, + str_or_none, try_call, + write_string, ) CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -1091,3 +1097,139 @@ def load(self, data): else: morsel = None + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. + +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard=False, ignore_expires=False): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, *args, **kwargs): + """ + Save cookies to a file. 
+        Code is taken from CPython 3.6
+        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
+
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+        # Store session cookies with `expires` set to 0 instead of an empty string
+        for cookie in self:
+            if cookie.expires is None:
+                cookie.expires = 0
+
+        with self.open(filename, write=True) as f:
+            f.write(self._HEADER)
+            self._really_save(f, *args, **kwargs)
+
+    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
+        """Load cookies from a file."""
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
+
+        def prepare_line(line):
+            if line.startswith(self._HTTPONLY_PREFIX):
+                line = line[len(self._HTTPONLY_PREFIX):]
+            # comments and empty lines are fine
+            if line.startswith('#') or not line.strip():
+                return line
+            cookie_list = line.split('\t')
+            if len(cookie_list) != self._ENTRY_LEN:
+                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
+            cookie = self._CookieFileEntry(*cookie_list)
+            if cookie.expires_at and not cookie.expires_at.isdigit():
+                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
+            return line
+
+        cf = io.StringIO()
+        with self.open(filename) as f:
+            for line in f:
+                try:
+                    cf.write(prepare_line(line))
+                except http.cookiejar.LoadError as e:
+                    if f'{line.strip()} '[0] in '[{"':
+                        raise http.cookiejar.LoadError(
+                            'Cookies file must be Netscape formatted, not JSON. See '
+                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
+                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
+                    continue
+        cf.seek(0)
+        self._really_load(cf, filename, ignore_discard, ignore_expires)
+        # Session cookies are denoted by either `expires` field set to
+        # an empty string or 0. MozillaCookieJar only recognizes the former
+        # (see [1]). So we need to force the latter to be recognized as session
+        # cookies on our own.
+        # Session cookies may be important for cookies-based authentication,
+        # e.g. usually, when user does not check 'Remember me' check box while
+        # logging in on a site, some important cookies are stored as session
+        # cookies so that not recognizing them will result in failed login.
+        # 1.
https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + def get_cookie_header(self, url): + """Generate a Cookie HTTP header for a given url""" + cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + self.add_cookie_header(cookie_req) + return cookie_req.get_header('Cookie') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1b1dd560fd..306911a6c7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3444,7 +3444,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None, def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index b0578a1d6b..1097778f0f 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -10,6 +10,9 @@ from .traversal import traverse_obj from ..dependencies import certifi, websockets +# isort: split +from ..cookies import YoutubeDLCookieJar # noqa: F401 + has_certifi = bool(certifi) has_websockets = bool(websockets) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d78022295b..6f4f22bb31 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1518,136 +1518,6 @@ def is_path_like(f): return isinstance(f, (str, bytes, os.PathLike)) -class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): - """ - See [1] for cookie file format. - - 1. https://curl.haxx.se/docs/http-cookies.html - """ - _HTTPONLY_PREFIX = '#HttpOnly_' - _ENTRY_LEN = 7 - _HEADER = '''# Netscape HTTP Cookie File -# This file is generated by yt-dlp. Do not edit. - -''' - _CookieFileEntry = collections.namedtuple( - 'CookieFileEntry', - ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) - - def __init__(self, filename=None, *args, **kwargs): - super().__init__(None, *args, **kwargs) - if is_path_like(filename): - filename = os.fspath(filename) - self.filename = filename - - @staticmethod - def _true_or_false(cndn): - return 'TRUE' if cndn else 'FALSE' - - @contextlib.contextmanager - def open(self, file, *, write=False): - if is_path_like(file): - with open(file, 'w' if write else 'r', encoding='utf-8') as f: - yield f - else: - if write: - file.truncate(0) - yield file - - def _really_save(self, f, ignore_discard=False, ignore_expires=False): - now = time.time() - for cookie in self: - if (not ignore_discard and cookie.discard - or not ignore_expires and cookie.is_expired(now)): - continue - name, value = cookie.name, cookie.value - if value is None: - # cookies.txt regards 'Set-Cookie: foo' as a cookie - # with no name, whereas http.cookiejar regards it as a - # cookie with no value. - name, value = '', name - f.write('%s\n' % '\t'.join(( - cookie.domain, - self._true_or_false(cookie.domain.startswith('.')), - cookie.path, - self._true_or_false(cookie.secure), - str_or_none(cookie.expires, default=''), - name, value - ))) - - def save(self, filename=None, *args, **kwargs): - """ - Save cookies to a file. 
- Code is taken from CPython 3.6 - https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ - - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - # Store session cookies with `expires` set to 0 instead of an empty string - for cookie in self: - if cookie.expires is None: - cookie.expires = 0 - - with self.open(filename, write=True) as f: - f.write(self._HEADER) - self._really_save(f, *args, **kwargs) - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - """Load cookies from a file.""" - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - def prepare_line(line): - if line.startswith(self._HTTPONLY_PREFIX): - line = line[len(self._HTTPONLY_PREFIX):] - # comments and empty lines are fine - if line.startswith('#') or not line.strip(): - return line - cookie_list = line.split('\t') - if len(cookie_list) != self._ENTRY_LEN: - raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) - cookie = self._CookieFileEntry(*cookie_list) - if cookie.expires_at and not cookie.expires_at.isdigit(): - raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) - return line - - cf = io.StringIO() - with self.open(filename) as f: - for line in f: - try: - cf.write(prepare_line(line)) - except http.cookiejar.LoadError as e: - if f'{line.strip()} '[0] in '[{"': - raise http.cookiejar.LoadError( - 'Cookies file must be Netscape formatted, not JSON. See ' - 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') - write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') - continue - cf.seek(0) - self._really_load(cf, filename, ignore_discard, ignore_expires) - # Session cookies are denoted by either `expires` field set to - # an empty string or 0. MozillaCookieJar only recognizes the former - # (see [1]). So we need force the latter to be recognized as session - # cookies on our own. - # Session cookies may be important for cookies-based authentication, - # e.g. usually, when user does not check 'Remember me' check box while - # logging in on a site, some important cookies are stored as session - # cookies so that not recognizing them will result in failed login. - # 1. https://bugs.python.org/issue17164 - for cookie in self: - # Treat `expires=0` cookies as session cookies - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True - - class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): def __init__(self, cookiejar=None): urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) From 3f66b6fe50f8d5b545712f8b19d5ae62f5373980 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 27 May 2023 19:17:27 +1200 Subject: [PATCH 112/501] [core] Workaround erroneous urllib Windows proxy parsing (#7092) Convert proxies extracted from windows registry to http for older Python versions. 
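Roughly, the mis-parsing and the workaround look like this (the proxy address is
illustrative, not taken from a real registry):

    # Older urllib turns a scheme-less registry value such as '127.0.0.1:8080'
    # into 'https://127.0.0.1:8080' for the https proxy type; the compat shim
    # added below rewrites the scheme back to http.
    proxies = {'https': 'https://127.0.0.1:8080'}
    for scheme in ('https', 'ftp'):
        if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'):
            proxies[scheme] = 'http' + proxies[scheme][len(scheme):]
    assert proxies['https'] == 'http://127.0.0.1:8080'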
See: https://github.com/python/cpython/issues/86793 Authored by: coletdjnz --- Makefile | 2 +- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/compat/urllib/__init__.py | 7 ++++++ yt_dlp/compat/urllib/request.py | 40 ++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/compat/urllib/__init__.py create mode 100644 yt_dlp/compat/urllib/request.py diff --git a/Makefile b/Makefile index f03fe20523..b1ac0e7d68 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ offlinetest: codetest $(PYTHON) -m pytest -k "not download" # XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/utils yt_dlp/dependencies +CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib yt_dlp/utils yt_dlp/dependencies yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip for d in $(CODE_FOLDERS) ; do \ diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f69bc98c55..f49dbf07da 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -21,9 +21,9 @@ import tokenize import traceback import unicodedata -import urllib.request from .cache import Cache +from .compat import urllib # isort: split from .compat import compat_os_name, compat_shlex_quote from .cookies import load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name diff --git a/yt_dlp/compat/urllib/__init__.py b/yt_dlp/compat/urllib/__init__.py new file mode 100644 index 0000000000..6b6b8e103d --- /dev/null +++ b/yt_dlp/compat/urllib/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa: F405 +from urllib import * # noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib') +del passthrough_module diff --git a/yt_dlp/compat/urllib/request.py b/yt_dlp/compat/urllib/request.py new file mode 100644 index 0000000000..ff63b2f0e9 --- /dev/null +++ b/yt_dlp/compat/urllib/request.py @@ -0,0 +1,40 @@ +# flake8: noqa: F405 +from urllib.request import * # noqa: F403 + +from ..compat_utils import passthrough_module + +passthrough_module(__name__, 'urllib.request') +del passthrough_module + + +from .. import compat_os_name + +if compat_os_name == 'nt': + # On older python versions, proxies are extracted from Windows registry erroneously. [1] + # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] + # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade + # it to http on these older python versions to avoid issues + # This also applies for ftp proxy type, as ftp:// proxy scheme is not supported. 
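+    # e.g. a registry value of "127.0.0.1:8080" for the https proxy comes back from
+    # getproxies_registry() as "https://127.0.0.1:8080" on affected versions; the
+    # patched variant below rewrites it to "http://127.0.0.1:8080".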
+ # 1: https://github.com/python/cpython/issues/86793 + # 2: https://github.com/python/cpython/blob/51f1ae5ceb0673316c4e4b0175384e892e33cc6e/Lib/urllib/request.py#L2683-L2698 + import sys + from urllib.request import getproxies_environment, getproxies_registry + + def getproxies_registry_patched(): + proxies = getproxies_registry() + if ( + sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final + or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final + ): + return proxies + + for scheme in ('https', 'ftp'): + if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): + proxies[scheme] = 'http' + proxies[scheme][len(scheme):] + + return proxies + + def getproxies(): + return getproxies_environment() or getproxies_registry_patched() + +del compat_os_name From daafbf49b3482edae4d70dd37070be99742a926e Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 27 May 2023 22:40:05 +1200 Subject: [PATCH 113/501] [core] Support decoding multiple content encodings (#7142) Authored by: coletdjnz --- test/test_http.py | 76 ++++++++++++++++++++++++++++++++++++++++++ yt_dlp/utils/_utils.py | 61 +++++++++++++++++---------------- 2 files changed, 108 insertions(+), 29 deletions(-) diff --git a/test/test_http.py b/test/test_http.py index d684905da5..3941a6e776 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -17,9 +17,11 @@ import threading import urllib.error import urllib.request +import zlib from test.helper import http_server_port from yt_dlp import YoutubeDL +from yt_dlp.dependencies import brotli from yt_dlp.utils import sanitized_Request, urlencode_postdata from .helper import FakeYDL @@ -148,6 +150,31 @@ def do_GET(self): self.send_header('Location', new_url) self.send_header('Content-Length', '0') self.end_headers() + elif self.path == '/content-encoding': + encodings = self.headers.get('ytdl-encoding', '') + payload = b'<html><video src="/vid.mp4" /></html>' + for encoding in filter(None, (e.strip() for e in encodings.split(','))): + if encoding == 'br' and brotli: + payload = brotli.compress(payload) + elif encoding == 'gzip': + buf = io.BytesIO() + with gzip.GzipFile(fileobj=buf, mode='wb') as f: + f.write(payload) + payload = buf.getvalue() + elif encoding == 'deflate': + payload = zlib.compress(payload) + elif encoding == 'unsupported': + payload = b'raw' + break + else: + self._status(415) + return + self.send_response(200) + self.send_header('Content-Encoding', encodings) + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + else: self._status(404) @@ -302,6 +329,55 @@ def test_gzip_trailing_garbage(self): data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8') self.assertEqual(data, '<html><video src="/vid.mp4" /></html>') + @unittest.skipUnless(brotli, 'brotli support is not installed') + def test_brotli(self): + with FakeYDL() as ydl: + res = ydl.urlopen( + sanitized_Request( + f'http://127.0.0.1:{self.http_port}/content-encoding', + headers={'ytdl-encoding': 'br'})) + self.assertEqual(res.headers.get('Content-Encoding'), 'br') + self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') + + def test_deflate(self): + with FakeYDL() as ydl: + res = ydl.urlopen( + sanitized_Request( + f'http://127.0.0.1:{self.http_port}/content-encoding', + headers={'ytdl-encoding': 'deflate'})) + 
self.assertEqual(res.headers.get('Content-Encoding'), 'deflate')
+            self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
+
+    def test_gzip(self):
+        with FakeYDL() as ydl:
+            res = ydl.urlopen(
+                sanitized_Request(
+                    f'http://127.0.0.1:{self.http_port}/content-encoding',
+                    headers={'ytdl-encoding': 'gzip'}))
+            self.assertEqual(res.headers.get('Content-Encoding'), 'gzip')
+            self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
+
+    def test_multiple_encodings(self):
+        # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
+        with FakeYDL() as ydl:
+            for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
+                res = ydl.urlopen(
+                    sanitized_Request(
+                        f'http://127.0.0.1:{self.http_port}/content-encoding',
+                        headers={'ytdl-encoding': pair}))
+                self.assertEqual(res.headers.get('Content-Encoding'), pair)
+                self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
+
+    def test_unsupported_encoding(self):
+        # it should return the raw content
+        with FakeYDL() as ydl:
+            res = ydl.urlopen(
+                sanitized_Request(
+                    f'http://127.0.0.1:{self.http_port}/content-encoding',
+                    headers={'ytdl-encoding': 'unsupported'}))
+            self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported')
+            self.assertEqual(res.read(), b'raw')
+
 
 class TestClientCert(unittest.TestCase):
     def setUp(self):
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 6f4f22bb31..7c91faff86 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1361,6 +1361,23 @@ def brotli(data):
             return data
         return brotli.decompress(data)
 
+    @staticmethod
+    def gz(data):
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk at the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+
     def http_request(self, req):
         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
         # always respected by websites, some tend to give out URLs with non percent-encoded
@@ -1394,35 +1411,21 @@ def http_request(self, req):
 
     def http_response(self, req, resp):
         old_resp = resp
-        # gzip
-        if resp.headers.get('Content-encoding', '') == 'gzip':
-            content = resp.read()
-            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
-            try:
-                uncompressed = io.BytesIO(gz.read())
-            except OSError as original_ioerror:
-                # There may be junk add the end of the file
-                # See http://stackoverflow.com/q/4928560/35070 for details
-                for i in range(1, 1024):
-                    try:
-                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
-                        uncompressed = io.BytesIO(gz.read())
-                    except OSError:
-                        continue
-                    break
-                else:
-                    raise original_ioerror
-            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # deflate
-        if resp.headers.get('Content-encoding', '') == 'deflate':
-            gz = io.BytesIO(self.deflate(resp.read()))
-            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # brotli
-        if resp.headers.get('Content-encoding', '') == 'br':
-            resp = urllib.request.addinfourl(
-                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
+
+        # Content-Encoding header lists the encodings in order that they were applied [1].
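+        # e.g. "Content-Encoding: gzip, br" means the payload was gzipped first and
+        # then brotli-compressed.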
+ # To decompress, we simply do the reverse. + # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding + decoded_response = None + for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))): + if encoding == 'gzip': + decoded_response = self.gz(decoded_response or resp.read()) + elif encoding == 'deflate': + decoded_response = self.deflate(decoded_response or resp.read()) + elif encoding == 'br' and brotli: + decoded_response = self.brotli(decoded_response or resp.read()) + + if decoded_response is not None: + resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # https://github.com/ytdl-org/youtube-dl/issues/6457). From 6dc00acf0f1f1107a626c21befd1691403e6aeeb Mon Sep 17 00:00:00 2001 From: Mohamed Al Mehairbi <62325490+ItzMaxTV@users.noreply.github.com> Date: Sat, 27 May 2023 22:32:39 +0400 Subject: [PATCH 114/501] [extractor/weyyak] Add extractor (#7124) Closes #7118 Authored by: ItzMaxTV --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/weyyak.py | 86 +++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 yt_dlp/extractor/weyyak.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 49dd9aecd5..c288dca19b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2329,6 +2329,7 @@ WeverseLiveIE, ) from .wevidi import WeVidiIE +from .weyyak import WeyyakIE from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE diff --git a/yt_dlp/extractor/weyyak.py b/yt_dlp/extractor/weyyak.py new file mode 100644 index 0000000000..ef12be871f --- /dev/null +++ b/yt_dlp/extractor/weyyak.py @@ -0,0 +1,86 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_age_limit, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class WeyyakIE(InfoExtractor): + _VALID_URL = r'https?://weyyak\.com/(?P<lang>\w+)/(?:player/)?(?P<type>episode|movie)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://weyyak.com/en/player/episode/1341952/Ribat-Al-Hob-Episode49', + 'md5': '0caf55c1a615531c8fe60f146ae46849', + 'info_dict': { + 'id': '1341952', + 'ext': 'mp4', + 'title': 'Ribat Al Hob', + 'duration': 2771, + 'alt_title': 'رباط الحب', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 49', + 'episode_number': 49, + 'timestamp': 1485907200, + 'upload_date': '20170201', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Thrillers', 'Romance'], + 'tags': 'count:8', + }, + }, + { + 'url': 'https://weyyak.com/en/movie/233255/8-Seconds', + 'md5': 'fe740ae0f63e4d1c8a7fc147a410c564', + 'info_dict': { + 'id': '233255', + 'ext': 'mp4', + 'title': '8 Seconds', + 'duration': 6490, + 'alt_title': '8 ثواني', + 'description': 'md5:45b83a155c30b49950624c7e99600b9d', + 'age_limit': 15, + 'release_year': 2015, + 'timestamp': 1683106031, + 'upload_date': '20230503', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Social'], + 'cast': ['Ceylin Adiyaman', 'Esra Inal'], + }, + }, + ] + + def _real_extract(self, url): + video_id, lang, type_ = self._match_valid_url(url).group('id', 'lang', 'type') + + path = 'episode/' if type_ == 'episode' else 'contents/moviedetails?contentkey=' + data = self._download_json( + 
f'https://msapifo-prod-me.weyyak.z5.com/v1/{lang}/{path}{video_id}', video_id)['data'] + m3u8_url = self._download_json( + f'https://api-weyyak.akamaized.net/get_info/{data["video_id"]}', + video_id, 'Extracting video details')['url_video'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'alt_title': ('translated_title', {str}), + 'description': ('synopsis', {str}), + 'duration': ('length', {float_or_none}), + 'age_limit': ('age_rating', {parse_age_limit}), + 'season_number': ('season_number', {int_or_none}), + 'episode_number': ('episode_number', {int_or_none}), + 'thumbnail': ('imagery', 'thumbnail', {url_or_none}), + 'categories': ('genres', ..., {str}), + 'tags': ('tags', ..., {str}), + 'cast': (('main_actor', 'main_actress'), {str}), + 'timestamp': ('insertedAt', {unified_timestamp}), + 'release_year': ('production_year', {int_or_none}), + }), + } From 93e12ed76ef49252dc6869b59d21d0777e5e11af Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 28 May 2023 11:31:45 +1200 Subject: [PATCH 115/501] [extractor/youtube] Extract uploader metadata for feed/playlist items Fixes https://github.com/yt-dlp/yt-dlp/issues/7104 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 80edcd77da..3f0a4cd20a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1038,6 +1038,13 @@ def _extract_video(self, renderer): else self._get_count({'simpleText': view_count_text})) view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + channel = (self._get_text(renderer, 'ownerText', 'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')) + + channel_handle = traverse_obj(renderer, ( + 'shortBylineText', 'runs', ..., 'navigationEndpoint', + (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))), + expected_type=self.handle_from_url, get_all=False) return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -1047,9 +1054,11 @@ def _extract_video(self, renderer): 'description': description, 'duration': duration, 'channel_id': channel_id, - 'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText') - or self._get_text(reel_header_renderer, 'channelTitleText')), + 'channel': channel, 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'uploader': channel, + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), 'timestamp': (self._parse_time_text(time_text) if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) @@ -5851,7 +5860,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@colethedj1894', 'uploader': 'colethedj', }, + 'playlist': [{ + 'info_dict': { + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'id': 'BaW_jenozKc', + '_type': 'url', + 'ie_key': 'Youtube', + 'duration': 10, + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', + 'view_count': int, + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc', + 'channel': 'Philipp Hagemeister', + 
'uploader_id': '@PhilippHagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader': 'Philipp Hagemeister', + } + }], 'playlist_count': 1, + 'params': {'extract_flat': True}, }, { 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', @@ -6152,6 +6179,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': str, 'concurrent_view_count': int, 'channel': str, + 'uploader': str, + 'uploader_url': str, + 'uploader_id': str } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, From 738c90a463257634455ada3e5c18b714c531dede Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" <lauren@selfisekai.rocks> Date: Mon, 29 May 2023 05:22:38 +0200 Subject: [PATCH 116/501] [extractor/polskieradio] Improve extractors (#5948) Authored by: selfisekai --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/polskieradio.py | 208 ++++++++++++++++++++----------- 2 files changed, 137 insertions(+), 72 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c288dca19b..ba55ccbaf8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1479,7 +1479,6 @@ PolskieRadioPlayerIE, PolskieRadioPodcastIE, PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 68c4a2afd0..5bf92b9b59 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -2,26 +2,24 @@ import json import math import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, - compat_urlparse -) +from ..compat import compat_str from ..utils import ( - determine_ext, - extract_attributes, ExtractorError, InAdvancePagedList, + determine_ext, + extract_attributes, int_or_none, js_to_json, parse_iso8601, strip_or_none, traverse_obj, - unified_timestamp, unescapeHTML, + unified_timestamp, url_or_none, + urljoin, ) @@ -44,7 +42,7 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): 'duration': int_or_none(media.get('length')), 'vcodec': 'none' if media.get('provider') == 'audio' else None, }) - entry_title = compat_urllib_parse_unquote(media['desc']) + entry_title = urllib.parse.unquote(media['desc']) if entry_title: entry['title'] = entry_title yield entry @@ -130,10 +128,11 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(InfoExtractor): - # new next.js sites, excluding radiokierowcow.pl - _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio(?:24)?\.pl/artykul/(?P<id>\d+)' +class PolskieRadioIE(PolskieRadioBaseExtractor): + # new next.js sites + _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)' _TESTS = [{ + # articleData, attachments 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', 'info_dict': { 'id': '1587943', @@ -148,6 +147,31 @@ class PolskieRadioIE(InfoExtractor): 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', }, }], + }, { + # post, legacy html players + 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager', + 'info_dict': { + 'id': '2589163', + 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?', + 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473', + }, + 'playlist': [{ + 
'info_dict': { + 'id': '2577880', + 'ext': 'mp3', + 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a', + 'duration': 321, + }, + }], + }, { + # data, legacy + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0', + }, + 'playlist_count': 3, }, { 'url': 'https://trojka.polskieradio.pl/artykul/1632955', 'only_matching': True, @@ -166,7 +190,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) article_data = traverse_obj( - self._search_nextjs_data(webpage, playlist_id), ('props', 'pageProps', 'data', 'articleData')) + self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False) title = strip_or_none(article_data['title']) @@ -178,7 +203,13 @@ def _real_extract(self, url): 'id': self._search_regex( r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'), 'title': strip_or_none(entry.get('description')) or title, - } for entry in article_data.get('attachments') or () if entry['fileType'] in ('Audio', )] + } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )] + + if not entries: + # some legacy articles have no json attachments, but players in body + entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, { + 'title': title, + }) return self.playlist_result(entries, playlist_id, title, description) @@ -214,6 +245,15 @@ class PolskieRadioAuditionIE(InfoExtractor): 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, 'playlist_mincount': 722, + }, { + # some articles were "promoted to main page" and thus link to old frontend + 'url': 'https://trojka.polskieradio.pl/audycja/305', + 'info_dict': { + 'id': '305', + 'title': 'Co w mowie piszczy?', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_count': 1523, }] def _call_lp3(self, path, query, video_id, note): @@ -254,7 +294,6 @@ def _entries(self, playlist_id, has_episodes, has_articles): for article in page['data']: yield { '_type': 'url_transparent', - 'ie_key': PolskieRadioIE.ie_key(), 'id': str(article['id']), 'url': article['url'], 'title': article.get('shortTitle'), @@ -282,11 +321,8 @@ def _real_extract(self, url): class PolskieRadioCategoryIE(InfoExtractor): # legacy sites IE_NAME = 'polskieradio:category' - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', - 'only_matching': True - }, { 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', 'info_dict': { 'id': '4143', @@ -300,6 +336,36 @@ class PolskieRadioCategoryIE(InfoExtractor): 'title': 'Muzyka', }, 'playlist_mincount': 61 + }, { + # billennium tabs + 'url': 'https://www.polskieradio.pl/8/2385', + 'info_dict': { + 'id': '2385', + 'title': 'Droga przez mąkę', + }, + 'playlist_mincount': 111, + }, { + 'url': 'https://www.polskieradio.pl/10/4930', + 'info_dict': { + 'id': '4930', + 'title': 'Teraz K-pop!', + }, + 'playlist_mincount': 392, + }, { + # post back pages, audio content directly without articles + 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa', + 'info_dict': { + 'id': '7376', + 'title': 'Nowa mowa', + }, + 'playlist_mincount': 244, + }, { + 'url': 
'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458', + 'info_dict': { + 'id': '175458', + 'title': 'Krzysztof Dziuba', + }, + 'playlist_mincount': 420, }, { 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', 'only_matching': True, @@ -311,25 +377,61 @@ def suitable(cls, url): def _entries(self, url, page, category_id): content = page + is_billennium_tabs = 'onclick="TB_LoadTab(' in page + is_post_back = 'onclick="__doPostBack(' in page + pagination = page if is_billennium_tabs else None for page_num in itertools.count(2): for a_entry, entry_id in re.findall( - r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', content): entry = extract_attributes(a_entry) - href = entry.get('href') - if not href: - continue - yield self.url_result( - compat_urlparse.urljoin(url, href), PolskieRadioLegacyIE, - entry_id, entry.get('title')) - mobj = re.search( - r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', - content) - if not mobj: - break - next_url = compat_urlparse.urljoin(url, mobj.group('url')) - content = self._download_webpage( - next_url, category_id, 'Downloading page %s' % page_num) + if entry.get('href'): + yield self.url_result( + urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title')) + for a_entry in re.findall(r'<span data-media=({[^ ]+})', content): + yield traverse_obj(self._parse_json(a_entry, category_id), { + 'url': 'file', + 'id': 'uid', + 'duration': 'length', + 'title': ('title', {urllib.parse.unquote}), + 'description': ('desc', {urllib.parse.unquote}), + }) + if is_billennium_tabs: + params = self._search_json( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(', + pagination, 'next page params', category_id, default=None, close_objects=1, + contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x))) + if not params: + break + tab_content = self._download_json( + 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent', + category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'}, + data=json.dumps(dict(zip(( + 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode', + 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate', + 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber' + ), params))).encode())['d'] + content, pagination = tab_content['Content'], tab_content.get('PagerContent') + elif is_post_back: + target = self._search_regex( + r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)', + content, 'pagination postback target', group='target', default=None) + if not target: + break + content = self._download_webpage( + url, category_id, f'Downloading page {page_num}', + data=urllib.parse.urlencode({ + **self._hidden_inputs(content), + '__EVENTTARGET': target, + '__EVENTARGUMENT': 'Next', + }).encode()) + else: + next_url = urljoin(url, self._search_regex( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content, 'next page url', group='url', default=None)) + if not next_url: + break + content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}') def _real_extract(self, url): category_id = self._match_id(url) @@ -337,7 +439,7 @@ def 
_real_extract(self, url): if PolskieRadioAuditionIE.suitable(urlh.url): return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id) title = self._html_search_regex( - r'<title>([^<]+) - [^<]+ - [^<]+', + r'([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)', webpage, 'title', fatal=False) return self.playlist_result( self._entries(url, webpage, category_id), @@ -506,39 +608,3 @@ def _real_extract(self, url): 'Content-Type': 'application/json', }) return self._parse_episode(data[0]) - - -class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor): - _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P[0-9]+)' - IE_NAME = 'polskieradio:kierowcow' - - _TESTS = [{ - 'url': 'https://radiokierowcow.pl/artykul/2694529', - 'info_dict': { - 'id': '2694529', - 'title': 'Zielona fala reliktem przeszłości?', - 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2', - }, - 'playlist_count': 3, - }] - - def _real_extract(self, url): - media_id = self._match_id(url) - webpage = self._download_webpage(url, media_id) - nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId'] - article = self._download_json( - f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}', - media_id) - data = article['pageProps']['data'] - title = data['title'] - entries = self._extract_webpage_player_entries(data['content'], media_id, { - 'title': title, - }) - - return { - '_type': 'playlist', - 'id': media_id, - 'entries': entries, - 'title': title, - 'description': data.get('lead'), - } From fc5a7f9b27d2a89b1f3ca7d33a95301c21d832cd Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sun, 28 May 2023 23:31:26 -0400 Subject: [PATCH 117/501] [extractor/daftsex] Update domain and embed player url (#5966) Closes #5881 Authored by: JChris246 --- yt_dlp/extractor/daftsex.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/daftsex.py b/yt_dlp/extractor/daftsex.py index 551d5e3abe..92510c767c 100644 --- a/yt_dlp/extractor/daftsex.py +++ b/yt_dlp/extractor/daftsex.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( + ExtractorError, int_or_none, js_to_json, parse_count, @@ -12,21 +13,24 @@ class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P-?\d+_\d+)' + _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P-?\d+_\d+)' _TESTS = [{ - 'url': 'https://daftsex.com/watch/-35370899_456246186', - 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'url': 'https://daft.sex/watch/-35370899_456246186', + 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd', 'info_dict': { 'id': '-35370899_456246186', 'ext': 'mp4', 'title': 'just relaxing', - 'description': 'just relaxing - Watch video Watch video in high quality', + 'description': 'just relaxing – Watch video Watch video in high quality', 'upload_date': '20201113', 'timestamp': 1605261911, - 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'duration': 15.0, + 'view_count': int }, }, { - 'url': 'https://daftsex.com/watch/-156601359_456242791', + 'url': 'https://daft.sex/watch/-156601359_456242791', 'info_dict': { 'id': '-156601359_456242791', 'ext': 'mp4', @@ -36,6 +40,7 @@ class DaftsexIE(InfoExtractor): 
'timestamp': 1600250735, 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', }, + 'skip': 'deleted / private' }] def _real_extract(self, url): @@ -60,7 +65,7 @@ def _real_extract(self, url): webpage, 'player color', fatal=False) or '' embed_page = self._download_webpage( - 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color), video_id, headers={'Referer': url}) video_params = self._parse_json( self._search_regex( @@ -94,15 +99,19 @@ def _real_extract(self, url): 'age_limit': 18, } - item = self._download_json( + items = self._download_json( f'{server_domain}/method/video.get/{video_id}', video_id, headers={'Referer': url}, query={ 'token': video_params['video']['access_token'], 'videos': video_id, 'ckey': video_params['c_key'], 'credentials': video_params['video']['credentials'], - })['response']['items'][0] + })['response']['items'] + if not items: + raise ExtractorError('Video is not available', video_id=video_id, expected=True) + + item = items[0] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': From aed945e1b9b7d3af2a907e1a12e6508cc81d6a20 Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" Date: Mon, 29 May 2023 06:07:45 +0200 Subject: [PATCH 118/501] [extractor/wykop] Add extractors (#6140) Authored by: selfisekai --- yt_dlp/extractor/_extractors.py | 6 + yt_dlp/extractor/wykop.py | 268 ++++++++++++++++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 yt_dlp/extractor/wykop.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ba55ccbaf8..bf041ae619 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2357,6 +2357,12 @@ WSJArticleIE, ) from .wwe import WWEIE +from .wykop import ( + WykopDigIE, + WykopDigCommentIE, + WykopPostIE, + WykopPostCommentIE, +) from .xanimu import XanimuIE from .xbef import XBefIE from .xboxclips import XboxClipsIE diff --git a/yt_dlp/extractor/wykop.py b/yt_dlp/extractor/wykop.py new file mode 100644 index 0000000000..0fa6d524db --- /dev/null +++ b/yt_dlp/extractor/wykop.py @@ -0,0 +1,268 @@ +import json +import urllib.error + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + format_field, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class WykopBaseExtractor(InfoExtractor): + def _get_token(self, force_refresh=False): + if not force_refresh: + maybe_cached = self.cache.load('wykop', 'bearer') + if maybe_cached: + return maybe_cached + + new_token = traverse_obj( + self._do_call_api('auth', None, 'Downloading anonymous auth token', data={ + # hardcoded in frontend + 'key': 'w53947240748', + 'secret': 'd537d9e0a7adc1510842059ae5316419', + }), ('data', 'token')) + + self.cache.store('wykop', 'bearer', new_token) + return new_token + + def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}): + if data: + data = json.dumps({'data': data}).encode() + headers['Content-Type'] = 'application/json' + + return self._download_json( + f'https://wykop.pl/api/v3/{path}', video_id, + note=note, data=data, headers=headers) + + def _call_api(self, path, video_id, note='Downloading JSON metadata'): + token = self._get_token() + for retrying in range(2): + try: + return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'}) + except ExtractorError as e: + if not retrying and isinstance(e.cause, 
urllib.error.HTTPError) and e.cause.code == 403: + token = self._get_token(True) + continue + raise + + def _common_data_extract(self, data): + author = traverse_obj(data, ('author', 'username'), expected_type=str) + + return { + '_type': 'url_transparent', + 'display_id': data.get('slug'), + 'url': traverse_obj(data, + ('media', 'embed', 'url'), # what gets an iframe embed + ('source', 'url'), # clickable url (dig only) + expected_type=url_or_none), + 'thumbnail': traverse_obj( + data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none), + 'uploader': author, + 'uploader_id': author, + 'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'), + 'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted + 'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int), + 'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int), + 'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int), + 'age_limit': 18 if data.get('adult') else 0, + 'tags': data.get('tags'), + } + + +class WykopDigIE(WykopBaseExtractor): + IE_NAME = 'wykop:dig' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'info_dict': { + 'id': 'rlSTBvViflc', + 'ext': 'mp4', + 'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth', + 'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87', + 'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'], + 'age_limit': 0, + 'timestamp': 1669154480, + 'release_timestamp': 1669194241, + 'release_date': '20221123', + 'uploader': 'starnak', + 'uploader_id': 'starnak', + 'uploader_url': 'https://wykop.pl/ludzie/starnak', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'view_count': int, + 'channel': 'BBC Earth', + 'channel_id': 'UCwmZiChSryoWQCZMIQezgTg', + 'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg', + 'categories': ['Pets & Animals'], + 'upload_date': '20220923', + 'duration': 191, + 'channel_follower_count': int, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + }, + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'links/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': data['title'], + 'description': data.get('description'), + # time it got "digged" to the homepage + 'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '), + } + + +class WykopDigCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:dig:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)/[^/]+/komentarz/(?P\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g', + 'info_dict': { + 'id': 'u6tEi2FmKZY', + 'ext': 'mp4', + 'title': 'md5:e7c741c5baa7ed6478000caf72865577', + 'display_id': 
'md5:45b2d12bd0e262d09cc7cf7abc8412db', + 'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e', + 'timestamp': 1674476945, + 'uploader': 'Bartholomew', + 'uploader_id': 'Bartholomew', + 'uploader_url': 'https://wykop.pl/ludzie/Bartholomew', + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'tags': [], + 'availability': 'public', + 'duration': 1838, + 'upload_date': '20230117', + 'categories': ['Entertainment'], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'age_limit': 0, + 'chapters': 'count:3', + 'channel': 'Poszukiwacze Okazji', + 'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw', + 'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw', + }, + }] + + def _real_extract(self, url): + dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id')) + data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class WykopPostIE(WykopBaseExtractor): + IE_NAME = 'wykop:post' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek', + 'info_dict': { + 'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI', + 'title': 'PawelW124 - #kot #koty #smiesznykotek', + 'description': '#kot #koty #smiesznykotek', + 'display_id': 'kot-koty-smiesznykotek', + 'tags': ['kot', 'koty', 'smiesznykotek'], + 'uploader': 'PawelW124', + 'uploader_id': 'PawelW124', + 'uploader_url': 'https://wykop.pl/ludzie/PawelW124', + 'timestamp': 1668938142, + 'age_limit': 0, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'comment_count': int, + 'channel': 'Revan', + 'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw', + 'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw', + 'upload_date': '20221120', + 'modified_date': '20220814', + 'availability': 'public', + 'view_count': int, + }, + 'playlist_mincount': 15, + 'params': { + 'flat_playlist': True, + } + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'entries/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class WykopPostCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:post:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)/[^/#]+#(?P\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979', + 'info_dict': { + 'id': 'confusedquickarmyant', + 'ext': 'mp4', + 'title': 'tpap - treść komentarza', + 'display_id': 'tresc-komentarza', + 'description': 'treść komentarza', + 'uploader': 'tpap', + 'uploader_id': 'tpap', + 'uploader_url': 'https://wykop.pl/ludzie/tpap', + 'timestamp': 1675349470, + 'upload_date': '20230202', + 'tags': [], + 'duration': 2.12, + 'age_limit': 0, + 'categories': [], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + }, + }] + + def _real_extract(self, 
url): + post_id, comment_id = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id')) + data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } From c6d3f81a4077aaf9cffc6aa2d0dec92f38e74bb0 Mon Sep 17 00:00:00 2001 From: nixxo Date: Mon, 29 May 2023 06:20:03 +0200 Subject: [PATCH 119/501] [extractor/rai] Rewrite extractors (#5940) Authored by: nixxo, danog Closes #5672, closes #6341 Co-authored-by: Daniil Gentili --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/rai.py | 575 +++++++++++++++----------------- 2 files changed, 271 insertions(+), 307 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bf041ae619..d9028a8310 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1548,6 +1548,8 @@ RadLiveSeasonIE, ) from .rai import ( + RaiIE, + RaiCulturaIE, RaiPlayIE, RaiPlayLiveIE, RaiPlayPlaylistIE, @@ -1556,7 +1558,6 @@ RaiPlaySoundPlaylistIE, RaiNewsIE, RaiSudtirolIE, - RaiIE, ) from .raywenderlich import ( RayWenderlichIE, diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index cab12cc214..df4102a409 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -1,19 +1,12 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( clean_html, determine_ext, ExtractorError, filter_dict, - find_xpath_attr, - fix_xml_ampersands, GeoRestrictedError, - HEADRequest, int_or_none, join_nonempty, parse_duration, @@ -35,82 +28,70 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): + def fix_cdata(s): + # remove \r\n\t before and after to avoid + # polluted text with xpath_text + s = re.sub(r'(\]\]>)[\r\n\t]+()[\r\n\t]+( 0 else None, + 'format_id': join_nonempty('https', bitrate, delim='-'), + }) + else: + raise ExtractorError('Unrecognized media file found') - if xpath_text(relinker, './license_url', default='{}') != '{}': - self.report_drm(video_id) - - if not geoprotection: - geoprotection = xpath_text( - relinker, './geoprotection', default=None) == 'Y' - - if not is_live: - is_live = xpath_text( - relinker, './is_live', default=None) == 'Y' - if not duration: - duration = parse_duration(xpath_text( - relinker, './duration', default=None)) - - url_elem = find_xpath_attr(relinker, './url', 'type', 'content') - if url_elem is None: - continue - - media_url = url_elem.text - - # This does not imply geo restriction (e.g. 
- # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if '/video_no_available.mp4' in media_url: - continue - - ext = determine_ext(media_url) - if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): - continue - - if ext == 'mp3': - formats.append({ - 'url': media_url, - 'vcodec': 'none', - 'acodec': 'mp3', - 'format_id': 'http-mp3', - }) - break - elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m' or platform == 'flash': - manifest_url = update_url_query( - media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), - {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) - formats.extend(self._extract_f4m_formats( - manifest_url, video_id, f4m_id='hds', fatal=False)) - else: - bitrate = int_or_none(xpath_text(relinker, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': f'http-{bitrate if bitrate > 0 else "http"}', - }) - - if not formats and geoprotection is True: + if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url: self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - if not audio_only: - formats.extend(self._create_http_urls(relinker_url, formats)) + if not audio_only and not is_live: + formats.extend(self._create_http_urls(media_url, relinker_url, formats)) return filter_dict({ 'is_live': is_live, @@ -118,38 +99,31 @@ def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): 'formats': formats, }) - def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = r'https?://(?P[^/]+?)/(?:i/)?(?P[^/]+?)/(?P.+?)/(?P\w+)(?:_(?P[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
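(Illustrative aside, not part of the patch: the rewritten `_create_http_urls` below reads the available qualities out of the manifest URL itself. A rough standalone sketch of that parsing, run against a made-up CDN URL, with the leading capture group simplified to be unnamed:)

```python
import re

# Hypothetical playlist URL of the shape the new regex targets: the path embeds
# a comma-separated list of available bitrates.
manifest_url = 'https://example.akamaized.net/video/program_250,400,700,1200,.mp4.csmil/playlist.m3u8'

mobj = re.search(r'/\w+(?:_(?P<quality>[\d,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8', manifest_url)
available_qualities = [q for q in (mobj.group('quality') or '').split(',') if q]
print(available_qualities)  # ['250', '400', '700', '1200']
```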
+ def _create_http_urls(self, manifest_url, relinker_url, fmts): + _MANIFEST_REG = r'/(?P\w+)(?:_(?P[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8' _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h - '250': [352, 198], - '400': [512, 288], - '700': [512, 288], - '800': [700, 394], - '1200': [736, 414], - '1800': [1024, 576], - '2400': [1280, 720], - '3200': [1440, 810], - '3600': [1440, 810], - '5000': [1920, 1080], - '10000': [1920, 1080], + 250: [352, 198], + 400: [512, 288], + 600: [512, 288], + 700: [512, 288], + 800: [700, 394], + 1200: [736, 414], + 1500: [920, 518], + 1800: [1024, 576], + 2400: [1280, 720], + 3200: [1440, 810], + 3600: [1440, 810], + 5000: [1920, 1080], + 10000: [1920, 1080], } - def test_url(url): - resp = self._request_webpage( - HEADRequest(url), None, headers={'User-Agent': 'Rai'}, - fatal=False, errnote=False, note=False) - - if resp is False: + def percentage(number, target, pc=20, roof=125): + '''check if the target is in the range of number +/- percent''' + if not number or number < 0: return False - - if resp.code == 200: - return False if resp.url == url else resp.url - return None - - # filter out audio-only formats - fmts = [f for f in fmts if not f.get('vcodec') == 'none'] + return abs(target - number) < min(float(number) * float(pc) / 100.0, roof) def get_format_info(tbr): import math @@ -157,67 +131,78 @@ def get_format_info(tbr): if len(fmts) == 1 and not br: br = fmts[0].get('tbr') if br and br > 300: - tbr = compat_str(math.floor(br / 100) * 100) + tbr = math.floor(br / 100) * 100 else: - tbr = '250' + tbr = 250 # try extracting info from available m3u8 formats - format_copy = None + format_copy = [None, None] for f in fmts: if f.get('tbr'): - br_limit = math.floor(br / 100) - if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: - format_copy = f.copy() + if percentage(tbr, f['tbr']): + format_copy[0] = f.copy() + if [f.get('width'), f.get('height')] == _QUALITY.get(tbr): + format_copy[1] = f.copy() + format_copy[1]['tbr'] = tbr + + # prefer format with similar bitrate because there might be + # multiple video with the same resolution but different bitrate + format_copy = format_copy[0] or format_copy[1] or {} return { + 'format_id': f'https-{tbr}', 'width': format_copy.get('width'), 'height': format_copy.get('height'), 'tbr': format_copy.get('tbr'), 'vcodec': format_copy.get('vcodec'), 'acodec': format_copy.get('acodec'), 'fps': format_copy.get('fps'), - 'format_id': f'https-{tbr}', } if format_copy else { + 'format_id': f'https-{tbr}', 'width': _QUALITY[tbr][0], 'height': _QUALITY[tbr][1], - 'format_id': f'https-{tbr}', - 'tbr': int(tbr), + 'tbr': tbr, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + 'fps': 25, } - loc = test_url(_MP4_TMPL % (relinker_url, '*')) - if not isinstance(loc, compat_str): - return [] + # filter out single-stream formats + fmts = [f for f in fmts + if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none'] - mobj = re.match( - _RELINKER_REG, - test_url(relinker_url) or '') + mobj = re.search(_MANIFEST_REG, manifest_url) if not mobj: return [] - available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] - available_qualities = [i for i in available_qualities if i] formats = [] - for q in available_qualities: - fmt = { + for q in filter(None, available_qualities): + self.write_debug(f'Creating https format for quality {q}') + formats.append({ 'url': _MP4_TMPL % (relinker_url, q), 'protocol': 'https', 'ext': 'mp4', **get_format_info(q) - } - 
formats.append(fmt) + }) return formats + @staticmethod + def _get_thumbnails_list(thumbs, url): + return [{ + 'url': urljoin(url, thumb_url), + } for thumb_url in (thumbs or {}).values() if thumb_url] + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' SRT_EXT = 'srt' subtitles = {} - subtitles_array = video_data.get('subtitlesArray') or [] + subtitles_array = video_data.get('subtitlesArray') or video_data.get('subtitleList') or [] for k in ('subtitles', 'subtitlesUrl'): subtitles_array.append({'url': video_data.get(k)}) for subtitle in subtitles_array: sub_url = subtitle.get('url') - if sub_url and isinstance(sub_url, compat_str): + if sub_url and isinstance(sub_url, str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) sub_ext = determine_ext(sub_url, SRT_EXT) @@ -236,7 +221,7 @@ def _extract_subtitles(url, video_data): class RaiPlayIE(RaiBaseIE): _VALID_URL = rf'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P{RaiBaseIE._UUID_RE}))\.(?:html|json)' _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', @@ -244,22 +229,20 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Report del 07/04/2014', 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai Gulp', + 'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', 'duration': 6160, 'series': 'Report', 'season': '2013/14', - 'subtitles': { - 'it': 'count:4', - }, + 'subtitles': {'it': 'count:4'}, 'release_year': 2022, 'episode': 'Espresso nel caffè - 07/04/2014', 'timestamp': 1396919880, 'upload_date': '20140408', + 'formats': 'count:4', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }, { # 1080p direct mp4 url 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', @@ -270,8 +253,9 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Blanca - S1E1 - Senza occhi', 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi', 'description': 'md5:75f95d5c030ec8bac263b1212322e28c', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 1', + 'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg', + 'uploader': 'Rai Premium', + 'creator': 'Rai Fiction', 'duration': 6493, 'series': 'Blanca', 'season': 'Season 1', @@ -281,6 +265,30 @@ class RaiPlayIE(RaiBaseIE): 'episode': 'Senza occhi', 'timestamp': 1637318940, 'upload_date': '20211119', + 'formats': 'count:12', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. 
Likely due to geo-restriction.'] + }, { + # 1500 quality + 'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html', + 'md5': 'a634d20e8ab2d43724c273563f6bf87a', + 'info_dict': { + 'id': '0cab3323-732e-45d6-8e86-7704acab6598', + 'ext': 'mp4', + 'title': 'Mia and Me - S1E11 - Tutto ciò che luccica', + 'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica', + 'description': 'md5:4969e594184b1920c4c1f2b704da9dea', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'series': 'Mia and Me', + 'season': 'Season 1', + 'episode_number': 11, + 'release_year': 2015, + 'season_number': 1, + 'episode': 'Tutto ciò che luccica', + 'timestamp': 1348495020, + 'upload_date': '20120924', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', @@ -299,57 +307,40 @@ def _real_extract(self, url): base, video_id = self._match_valid_url(url).groups() media = self._download_json( - base + '.json', video_id, 'Downloading video JSON') + f'{base}.json', video_id, 'Downloading video JSON') if not self.get_param('allow_unplayable_formats'): - if try_get( - media, - (lambda x: x['rights_management']['rights']['drm'], - lambda x: x['program_info']['rights_management']['rights']['drm']), - dict): + if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): self.report_drm(video_id) - title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - - thumbnails = [] - for _, value in media.get('images', {}).items(): - if value: - thumbnails.append({ - 'url': urljoin(url, value), - }) - - date_published = media.get('date_published') - time_published = media.get('time_published') - if date_published and time_published: - date_published += ' ' + time_published - - subtitles = self._extract_subtitles(url, video) - - program_info = media.get('program_info') or {} + date_published = join_nonempty( + media.get('date_published'), media.get('time_published'), delim=' ') season = media.get('season') - alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') return { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, - 'title': title, + 'title': media.get('name'), 'alt_title': strip_or_none(alt_title or None), 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel') or None), - 'creator': strip_or_none(media.get('editor') or None), + 'uploader': strip_or_none( + traverse_obj(media, ('program_info', 'channel')) + or media.get('channel') or None), + 'creator': strip_or_none( + traverse_obj(media, ('program_info', 'editor')) + or media.get('editor') or None), 'duration': parse_duration(video.get('duration')), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, - 'series': program_info.get('name'), + 'thumbnails': self._get_thumbnails_list(media.get('images'), url), + 'series': traverse_obj(media, ('program_info', 'name')), 'season_number': int_or_none(season), 'season': season if (season and not season.isdigit()) else None, 'episode': media.get('episode_title'), 'episode_number': int_or_none(media.get('episode')), - 'subtitles': subtitles, + 'subtitles': self._extract_subtitles(url, video), 'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), **relinker_info } @@ -371,38 +362,39 @@ class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE 
'live_status': 'is_live', 'upload_date': '20090502', 'timestamp': 1241276220, + 'formats': 'count:3', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] class RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))(?:/(?P[^?#&]+))?' _TESTS = [{ + # entire series episodes + extras... 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_mincount': 30, }, { + # single season 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo - Stagione 2', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_count': 12, }] def _real_extract(self, url): base, playlist_id, extra_id = self._match_valid_url(url).groups() program = self._download_json( - base + '.json', playlist_id, 'Downloading program JSON') + f'{base}.json', playlist_id, 'Downloading program JSON') if extra_id: extra_id = extra_id.upper().rstrip('/') @@ -450,7 +442,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'title': 'Il Ruggito del Coniglio del 10/12/2021', 'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg$', 'uploader': 'rai radio 2', 'duration': 5685, 'series': 'Il Ruggito del Coniglio', @@ -459,9 +451,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'timestamp': 1638346620, 'upload_date': '20211201', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): @@ -480,9 +470,6 @@ def _real_extract(self, url): lambda x: x['live']['create_date'])) podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {} - thumbnails = [{ - 'url': urljoin(url, thumb_url), - } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url] return { **info, @@ -494,7 +481,7 @@ def _real_extract(self, url): 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, + 'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url), 'series': podcast_info.get('title'), 'season_number': int_or_none(media.get('season')), 'episode': media.get('episode_title'), @@ -512,30 +499,30 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete 'display_id': 'radio2', 'ext': 'mp4', 'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', - 'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png', + 'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png', 'uploader': 'rai radio 2', 'series': 'Rai Radio 2', 'creator': 'raiplaysound', 'is_live': True, 'live_status': 'is_live', }, - 'params': { - 'skip_download': 'live', - }, + 'params': {'skip_download': True}, }] class RaiPlaySoundPlaylistIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P[^/?#&]+))(?:/(?P[^?#&]+))?' 
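(Illustrative aside, not part of the patch: what the pattern above captures for the single-season URL in the tests below, with the groups simplified to be unnamed:)

```python
import re

pattern = (r'(https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/'
           r'([^/?#&]+))(?:/([^?#&]+))?')
url = 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995'
print(re.match(pattern, url).groups())
# ('https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
#  'ilruggitodelconiglio', 'puntate/prima-stagione-1995')
```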
_TESTS = [{ + # entire show 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio', 'info_dict': { 'id': 'ilruggitodelconiglio', 'title': 'Il Ruggito del Coniglio', - 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3', + 'description': 'md5:48cff6972435964284614d70474132e6', }, 'playlist_mincount': 65, }, { + # single season 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995', 'info_dict': { 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995', @@ -568,22 +555,19 @@ def _real_extract(self, url): class RaiIE(RaiBaseIE): _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' _TESTS = [{ - # var uniquename = "ContentItem-..." - # data-id="ContentItem-..." 'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1758, 'upload_date': '20140612', }, - 'skip': 'This content is available only in Italy', + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] }, { - # with ContentItem in og:url 'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -592,123 +576,51 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2214, 'upload_date': '20161103' - } + }, + 'params': {'skip_download': True}, }, { - # Direct MMS URL + # Direct MMS: Media URL no longer works. 
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, }] - def _extract_from_content_id(self, content_id, url): + def _real_extract(self, url): + content_id = self._match_id(url) media = self._download_json( f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', - content_id, 'Downloading video JSON') + content_id, 'Downloading video JSON', fatal=False, expected_status=404) - title = media['name'].strip() + if media is None: + return None - media_type = media['type'] - if 'Audio' in media_type: + if 'Audio' in media['type']: relinker_info = { 'formats': [{ - 'format_id': media.get('formatoAudio'), + 'format_id': join_nonempty('https', media.get('formatoAudio'), delim='-'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), + 'vcodec': 'none', + 'acodec': media.get('formatoAudio'), }] } - elif 'Video' in media_type: + elif 'Video' in media['type']: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) else: raise ExtractorError('not a media file') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(url, thumbnail_url), - }) - - subtitles = self._extract_subtitles(url, media) + thumbnails = self._get_thumbnails_list( + {image_type: media.get(image_type) for image_type in ( + 'image', 'image_medium', 'image_300')}, url) return { 'id': content_id, - 'title': title, - 'description': strip_or_none(media.get('desc') or None), + 'title': strip_or_none(media.get('name') or media.get('title')), + 'description': strip_or_none(media.get('desc')) or None, 'thumbnails': thumbnails, - 'uploader': strip_or_none(media.get('author') or None), + 'uploader': strip_or_none(media.get('author')) or None, 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'subtitles': subtitles, - **relinker_info - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - content_item_id = None - - content_item_url = self._html_search_meta( - ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', - 'twitter:player', 'jsonlink'), webpage, default=None) - if content_item_url: - content_item_id = self._search_regex( - rf'ContentItem-({self._UUID_RE})', content_item_url, - 'content item id', default=None) - - if not content_item_id: - content_item_id = self._search_regex( - rf'''(?x) - (?: - (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)=| - ]+\bsrc= - ) - (["\']) - (?:(?!\1).)*\bContentItem-(?P{self._UUID_RE}) - ''', - webpage, 'content item id', default=None, group='id') - - content_item_ids = set() - if content_item_id: - content_item_ids.add(content_item_id) - if video_id not in content_item_ids: - content_item_ids.add(video_id) - - for content_item_id in content_item_ids: - try: - return self._extract_from_content_id(content_item_id, url) - except GeoRestrictedError: - raise - except ExtractorError: - pass - - relinker_url = self._proto_relative_url(self._search_regex( - r'''(?x) - (?: - var\s+videoURL| - mediaInfo\.mediaUri - )\s*=\s* - ([\'"]) - (?P - (?:https?:)? - //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
- (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 - ''', - webpage, 'relinker URL', group='url')) - - relinker_info = self._extract_relinker_info( - urljoin(url, relinker_url), video_id) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, + 'subtitles': self._extract_subtitles(url, media), **relinker_info } @@ -726,7 +638,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 1589, 'upload_date': '20220529', 'uploader': 'rainews', - } + }, + 'params': {'skip_download': True}, }, { # old content with fallback method to extract media urls 'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -739,12 +652,14 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 833, 'upload_date': '20161103' }, + 'params': {'skip_download': True}, 'expected_warnings': ['unable to extract player_data'], }, { # iframe + drm 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', 'only_matching': True, }] + _PLAYER_TAG = 'news' def _real_extract(self, url): video_id = self._match_id(url) @@ -752,8 +667,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) player_data = self._search_json( - r'<rainews-player\s*data=\'', webpage, 'player_data', video_id, - transform_source=clean_html, fatal=False) + rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id, + transform_source=clean_html, default={}) track_info = player_data.get('track_info') relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') @@ -770,16 +685,36 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': track_info.get('title') or self._og_search_title(webpage), + 'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage), 'upload_date': unified_strdate(track_info.get('date')), 'uploader': strip_or_none(track_info.get('editor') or None), **relinker_info } -class RaiSudtirolIE(RaiBaseIE): - _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)' +class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE + _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] _TESTS = [{ + 'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html', + 'info_dict': { + 'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb', + 'ext': 'mp4', + 'title': 'Alberto Asor Rosa: Letteratura e potere', + 'duration': 1756, + 'upload_date': '20181206', + 'uploader': 'raicultura', + 'formats': 'count:2', + }, + 'params': {'skip_download': True}, + }] + _PLAYER_TAG = 'cultura' + + +class RaiSudtirolIE(RaiBaseIE): + _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)' + _TESTS = [{ + # mp4 file 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', 'info_dict': { 'id': 'Ptv1619729460', @@ -787,34 +722,62 @@ class RaiSudtirolIE(RaiBaseIE): 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', 'series': 'Euro: trasmisciun d\'economia', 'upload_date': 
'20210429', - 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg', + 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg', 'uploader': 'raisudtirol', - } + 'formats': 'count:1', + }, + 'params': {'skip_download': True}, + }, { + # m3u manifest + 'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil', + 'info_dict': { + 'id': 'GUGGUG_P1', + 'ext': 'mp4', + 'title': 'GUGGUG! La Prospettiva - Die Perspektive', + 'uploader': 'raisudtirol', + 'formats': 'count:6', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_date = self._html_search_regex(r'<span class="med_data">(.+?)</span>', webpage, 'video_date', fatal=False) - video_title = self._html_search_regex(r'<span class="med_title">(.+?)</span>', webpage, 'video_title', fatal=False) - video_url = self._html_search_regex(r'sources:\s*\[\{file:\s*"(.+?)"\}\]', webpage, 'video_url') - video_thumb = self._html_search_regex(r'image: \'(.+?)\'', webpage, 'video_thumb', fatal=False) + video_date = self._html_search_regex( + r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None) + video_title = self._html_search_regex([ + r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\','], + webpage, 'video_title', default=None) + video_url = self._html_search_regex([ + r'sources:\s*\[\{file:\s*"(.+?)"\}\]', + r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"'], + webpage, 'video_url', default=None) - return { - 'id': video_id, - 'title': join_nonempty(video_title, video_date, delim=' - '), - 'series': video_title, - 'upload_date': unified_strdate(video_date), - 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb), - 'uploader': 'raisudtirol', - 'formats': [{ + ext = determine_ext(video_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(video_url, video_id) + elif ext == 'mp4': + formats = [{ 'format_id': 'https-mp4', 'url': self._proto_relative_url(video_url), 'width': 1024, 'height': 576, 'fps': 25, - 'vcodec': 'h264', - 'acodec': 'aac', - }], + 'vcodec': 'avc1', + 'acodec': 'mp4a', + }] + else: + formats = [] + self.raise_no_formats(f'Unrecognized media file: {video_url}') + + return { + 'id': video_id, + 'title': join_nonempty(video_title, video_date, delim=' - '), + 'series': video_title if video_date else None, + 'upload_date': unified_strdate(video_date), + 'thumbnail': urljoin('https://raisudtirol.rai.it/', self._html_search_regex( + r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)), + 'uploader': 'raisudtirol', + 'formats': formats, } From bfdf144c7e5d7a93fbfa9d8e65598c72bf2b542a Mon Sep 17 00:00:00 2001 From: Mohit Tokas <mohittokas@live.com> Date: Mon, 29 May 2023 10:16:32 +0530 Subject: [PATCH 120/501] [extractor/livestream] Support videos with account id (#6324) Authored by: theperfectpunk Closes #2225 --- yt_dlp/extractor/livestream.py | 96 +++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index d883eafcff..692d6ab3a6 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -1,33 +1,36 @@ -import re import itertools +import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str, compat_urlparse from ..utils import ( - find_xpath_attr, - xpath_attr, - xpath_with_ns, - xpath_text, - orderedSet, - update_url_query, - 
int_or_none, - float_or_none, - parse_iso8601, determine_ext, + find_xpath_attr, + float_or_none, + int_or_none, + orderedSet, + parse_iso8601, + traverse_obj, + update_url_query, + xpath_attr, + xpath_text, + xpath_with_ns, ) class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' - _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' + _VALID_URL = r'''(?x) + https?://(?:new\.)?livestream\.com/ + (?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+)) + (?:/events/(?P<event_id>\d+)|/(?P<event_name>[^/]+))? + (?:/videos/(?P<id>\d+))? + ''' _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'md5': '7876c5f5dc3e711b6b73acce4aac1527', 'info_dict': { 'id': '4719370', 'ext': 'mp4', @@ -37,22 +40,37 @@ class LivestreamIE(InfoExtractor): 'duration': 5968.0, 'like_count': int, 'view_count': int, + 'comment_count': int, 'thumbnail': r're:^http://.*\.jpg$' } }, { - 'url': 'http://new.livestream.com/tedx/cityenglish', + 'url': 'https://livestream.com/coheedandcambria/websterhall', 'info_dict': { - 'title': 'TEDCity2.0 (English)', - 'id': '2245590', + 'id': '1585861', + 'title': 'Live From Webster Hall' + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://livestream.com/dayananda/events/7954027', + 'info_dict': { + 'title': 'Live from Mevo', + 'id': '7954027', }, 'playlist_mincount': 4, }, { - 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'url': 'https://livestream.com/accounts/82', 'info_dict': { - 'title': 'Tata Steel Chess', - 'id': '3705884', - }, - 'playlist_mincount': 60, + 'id': '253978', + 'view_count': int, + 'title': 'trsr', + 'comment_count': int, + 'like_count': int, + 'upload_date': '20120306', + 'timestamp': 1331042383, + 'thumbnail': 'http://img.new.livestream.com/videos/0000000000000372/cacbeed6-fb68-4b5e-ad9c-e148124e68a9_640x427.jpg', + 'duration': 15.332, + 'ext': 'mp4' + } }, { 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', 'only_matching': True, @@ -179,7 +197,7 @@ def _extract_stream_info(self, stream_info): 'is_live': is_live, } - def _extract_event(self, event_data): + def _generate_event_playlist(self, event_data): event_id = compat_str(event_data['id']) account_id = compat_str(event_data['owner_account_id']) feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' @@ -189,7 +207,6 @@ def _extract_event(self, event_data): return self._extract_stream_info(stream_info) last_video = None - entries = [] for i in itertools.count(1): if last_video is None: info_url = feed_root_url @@ -197,31 +214,38 @@ def _extract_event(self, event_data): info_url = '{root}?&id={id}&newer=-1&type=video'.format( root=feed_root_url, id=last_video) videos_info = self._download_json( - info_url, event_id, 'Downloading page {0}'.format(i))['data'] + info_url, event_id, f'Downloading page {i}')['data'] videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] if not videos_info: break for v in videos_info: v_id = compat_str(v['id']) - entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), - 'Livestream', v_id, v.get('caption'))) + yield self.url_result( + 
f'http://livestream.com/accounts/{account_id}/events/{event_id}/videos/{v_id}', + LivestreamIE, v_id, v.get('caption')) last_video = videos_info[-1]['id'] - return self.playlist_result(entries, event_id, event_data['full_name']) def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') event = mobj.group('event_id') or mobj.group('event_name') account = mobj.group('account_id') or mobj.group('account_name') - api_url = self._API_URL_TEMPLATE % (account, event) + api_url = f'http://livestream.com/api/accounts/{account}' + if video_id: video_data = self._download_json( - api_url + '/videos/%s' % video_id, video_id) + f'{api_url}/events/{event}/videos/{video_id}', video_id) return self._extract_video_info(video_data) - else: - event_data = self._download_json(api_url, video_id) - return self._extract_event(event_data) + elif event: + event_data = self._download_json(f'{api_url}/events/{event}', None) + return self.playlist_result( + self._generate_event_playlist(event_data), str(event_data['id']), event_data['full_name']) + + account_data = self._download_json(api_url, None) + items = traverse_obj(account_data, (('upcoming_events', 'past_events'), 'data', ...)) + return self.playlist_result( + itertools.chain.from_iterable(map(self._generate_event_playlist, items)), + account_data.get('id'), account_data.get('full_name')) # The original version of Livestream uses a different system From 17d7ca84ea723c20668bd9bfa938be7ea0e64f6b Mon Sep 17 00:00:00 2001 From: Ha Tien Loi <loiht.b17vt220@stu.ptit.edu.vn> Date: Mon, 29 May 2023 12:02:16 +0700 Subject: [PATCH 121/501] [extractor/zingmp3] Fix and improve extractors (#6367) Authored by: hatienl0i261299 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/zingmp3.py | 101 ++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d9028a8310..6066b809b2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2483,6 +2483,7 @@ ZingMp3WeekChartIE, ZingMp3ChartMusicVideoIE, ZingMp3UserIE, + ZingMp3HubIE, ) from .zoom import ZoomIE from .zype import ZypeIE diff --git a/yt_dlp/extractor/zingmp3.py b/yt_dlp/extractor/zingmp3.py index a818c9fa9d..007658c659 100644 --- a/yt_dlp/extractor/zingmp3.py +++ b/yt_dlp/extractor/zingmp3.py @@ -1,16 +1,11 @@ -import functools import hashlib import hmac +import itertools import json import urllib.parse from .common import InfoExtractor -from ..utils import ( - OnDemandPagedList, - int_or_none, - traverse_obj, - urljoin, -) +from ..utils import int_or_none, traverse_obj, try_call, urljoin class ZingMp3BaseIE(InfoExtractor): @@ -37,6 +32,7 @@ class ZingMp3BaseIE(InfoExtractor): 'info-artist': '/api/v2/page/get/artist', 'user-list-song': '/api/v2/song/get/list', 'user-list-video': '/api/v2/video/get/list', + 'hub': '/api/v2/page/get/hub-detail', } def _api_url(self, url_type, params): @@ -46,9 +42,9 @@ def _api_url(self, url_type, params): ''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest() data = { **params, - 'apiKey': '88265e23d4284f25963e6eedac8fbfa3', - 'sig': hmac.new( - b'2aa2d1c561e809b267f3638c4a307aab', f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(), + 'apiKey': 'X5BM3w8N7MKozC0B85o4KMlzLZKhV00y', + 'sig': hmac.new(b'acOrvUS15XRW2o9JksiK1KgQ6Vbds8ZW', + f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(), } return f'{self._DOMAIN}{api_slug}?{urllib.parse.urlencode(data)}' @@ -67,6 +63,19 
@@ def _parse_items(self, items): for url in traverse_obj(items, (..., 'link')) or []: yield self.url_result(urljoin(self._DOMAIN, url)) + def _fetch_page(self, id_, url_type, page): + raise NotImplementedError('This method must be implemented by subclasses') + + def _paged_list(self, _id, url_type): + count = 0 + for page in itertools.count(1): + data = self._fetch_page(_id, url_type, page) + entries = list(self._parse_items(data.get('items'))) + count += len(entries) + yield from entries + if not data.get('hasMore') or try_call(lambda: count > data['total']): + break + class ZingMp3IE(ZingMp3BaseIE): _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed' @@ -166,8 +175,11 @@ def _real_extract(self, url): 'height': int_or_none(res), }) - if not formats and item.get('msg') == 'Sorry, this content is not available in your country.': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + if not formats: + if item.get('msg') == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + else: + self.raise_no_formats('The song is only for VIP accounts.') lyric = item.get('lyric') or self._call_api('lyric', {'id': item_id}, fatal=False).get('file') @@ -200,7 +212,7 @@ class ZingMp3AlbumIE(ZingMp3BaseIE): 'id': 'ZWZAEZZD', 'title': 'Những Bài Hát Hay Nhất Của Mr. Siro', }, - 'playlist_mincount': 49, + 'playlist_mincount': 20, }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, @@ -305,22 +317,20 @@ class ZingMp3ChartMusicVideoIE(ZingMp3BaseIE): 'id': 'IWZ9Z086', 'title': 'the-loai-video_Khong-Loi', }, - 'playlist_mincount': 10, + 'playlist_mincount': 1, }] def _fetch_page(self, song_id, url_type, page): - return self._parse_items(self._call_api(url_type, { + return self._call_api(url_type, { 'id': song_id, 'type': 'genre', - 'page': page + 1, + 'page': page, 'count': self._PER_PAGE - }).get('items')) + }) def _real_extract(self, url): song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type') - return self.playlist_result( - OnDemandPagedList(functools.partial(self._fetch_page, song_id, url_type), self._PER_PAGE), - song_id, f'{url_type}_{regions}') + return self.playlist_result(self._paged_list(song_id, url_type), song_id, f'{url_type}_{regions}') class ZingMp3UserIE(ZingMp3BaseIE): @@ -331,7 +341,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - bai-hat', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 91, }, { @@ -339,7 +349,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - album', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 3, }, { @@ -347,7 +357,7 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. Siro - single', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 20, }, { @@ -355,19 +365,19 @@ class ZingMp3UserIE(ZingMp3BaseIE): 'info_dict': { 'id': 'IWZ98609', 'title': 'Mr. 
Siro - video', - 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + 'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36', }, 'playlist_mincount': 15, }] def _fetch_page(self, user_id, url_type, page): url_type = 'user-list-song' if url_type == 'bai-hat' else 'user-list-video' - return self._parse_items(self._call_api(url_type, { + return self._call_api(url_type, { 'id': user_id, 'type': 'artist', - 'page': page + 1, + 'page': page, 'count': self._PER_PAGE - }, query={'sort': 'new', 'sectionId': 'aSong'}).get('items')) + }) def _real_extract(self, url): user_alias, url_type = self._match_valid_url(url).group('user', 'type') @@ -376,10 +386,41 @@ def _real_extract(self, url): user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias}) if url_type in ('bai-hat', 'video'): - entries = OnDemandPagedList( - functools.partial(self._fetch_page, user_info['id'], url_type), self._PER_PAGE) + entries = self._paged_list(user_info['id'], url_type) else: entries = self._parse_items(traverse_obj(user_info, ( - 'sections', lambda _, v: v['link'] == f'/{user_alias}/{url_type}', 'items', ...))) + 'sections', + lambda _, v: v['sectionId'] == 'aAlbum' if url_type == 'album' else v['sectionId'] == 'aSingle', + 'items', ...))) return self.playlist_result( entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography')) + + +class ZingMp3HubIE(ZingMp3BaseIE): + IE_NAME = 'zingmp3:hub' + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>hub)/(?P<regions>[^/]+)/(?P<id>[^\.]+)' + _TESTS = [{ + 'url': 'https://zingmp3.vn/hub/Nhac-Moi/IWZ9Z0CA.html', + 'info_dict': { + 'id': 'IWZ9Z0CA', + 'title': 'Nhạc Mới', + 'description': 'md5:1cc31b68a6f746427b07b2756c22a558', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://zingmp3.vn/hub/Nhac-Viet/IWZ9Z087.html', + 'info_dict': { + 'id': 'IWZ9Z087', + 'title': 'Nhạc Việt', + 'description': 'md5:acc976c8bdde64d5c6ee4a92c39f7a77', + }, + 'playlist_mincount': 30, + }] + + def _real_extract(self, url): + song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type') + hub_detail = self._call_api(url_type, {'id': song_id}) + entries = self._parse_items(traverse_obj(hub_detail, ( + 'sections', lambda _, v: v['sectionId'] == 'hub', 'items', ...))) + return self.playlist_result( + entries, song_id, hub_detail.get('title'), hub_detail.get('description')) From c6d4b82a8b8bce59b1c9ce5e6d349ea428dac0a7 Mon Sep 17 00:00:00 2001 From: Daniel Vogt <c0d3d3v@mag-keinen-spam.de> Date: Mon, 29 May 2023 07:21:26 +0200 Subject: [PATCH 122/501] [extractor/owncloud] Add extractor (#6533) Authored by: C0D3D3V --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/owncloud.py | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 yt_dlp/extractor/owncloud.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6066b809b2..b022442849 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1378,6 +1378,7 @@ ORFIPTVIE, ) from .outsidetv import OutsideTVIE +from .owncloud import OwnCloudIE from .packtpub import ( PacktPubIE, PacktPubCourseIE, diff --git a/yt_dlp/extractor/owncloud.py b/yt_dlp/extractor/owncloud.py new file mode 100644 index 0000000000..e1d5682f87 --- /dev/null +++ b/yt_dlp/extractor/owncloud.py @@ -0,0 +1,80 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + url_or_none, + urlencode_postdata, +) + + 
+class OwnCloudIE(InfoExtractor): + _INSTANCES_RE = '|'.join(( + r'(?:[^\.]+\.)?sciebo\.de', + r'cloud\.uni-koblenz-landau\.de', + )) + _VALID_URL = rf'https?://(?:{_INSTANCES_RE})/s/(?P<id>[\w.-]+)' + + _TESTS = [ + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/wWhqZzh9jTumVFN', + 'info_dict': { + 'id': 'wWhqZzh9jTumVFN', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + }, + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/WNDuFu0XuFtmm3f', + 'info_dict': { + 'id': 'WNDuFu0XuFtmm3f', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + 'params': { + 'videopassword': '12345', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, video_id) + + if re.search(r'<label[^>]+for="password"', webpage): + webpage = self._verify_video_password(webpage, urlh.geturl(), video_id) + + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs.get('filename') + parsed_url = urllib.parse.urlparse(url) + + return { + 'id': video_id, + 'title': title, + 'url': url_or_none(hidden_inputs.get('downloadURL')) or parsed_url._replace( + path=urllib.parse.urljoin(parsed_url.path, 'download')).geturl(), + 'ext': determine_ext(title), + } + + def _verify_video_password(self, webpage, url, video_id): + password = self.get_param('videopassword') + if password is None: + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + + validation_response = self._download_webpage( + url, video_id, 'Validating Password', 'Wrong password?', + data=urlencode_postdata({ + 'requesttoken': self._hidden_inputs(webpage)['requesttoken'], + 'password': password, + })) + + if re.search(r'<label[^>]+for="password"', validation_response): + warning = self._search_regex( + r'<div[^>]+class="warning">([^<]*)</div>', validation_response, + 'warning', default='The password is wrong') + raise ExtractorError(f'Opening the video failed, {self.IE_NAME} said: {warning!r}', expected=True) + return validation_response From 94627c5dde12a72766bdba36e056916c29c40ed1 Mon Sep 17 00:00:00 2001 From: Stefan Borer <stefan.borer@gmail.com> Date: Mon, 29 May 2023 07:26:49 +0200 Subject: [PATCH 123/501] [extractor/playsuisse] Support new url format (#6528) Authored by: sbor23 --- yt_dlp/extractor/playsuisse.py | 88 ++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index a635ac92f1..76288c7789 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -5,10 +5,16 @@ class PlaySuisseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)' _TESTS = [ { + # Old URL 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'only_matching': True, + }, + { + # episode in a series + 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', 'info_dict': { 'id': '763211', @@ -21,11 +27,11 @@ class PlaySuisseIE(InfoExtractor): 'season_number': 1, 'episode': 'Knochen', 'episode_number': 1, - 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/808675/0', + }, { + # film + 'url': 'https://www.playsuisse.ch/watch/808675', 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', 'info_dict': { 'id': '808675', @@ 
-33,26 +39,60 @@ class PlaySuisseIE(InfoExtractor): 'title': 'Der Läufer', 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', 'duration': 5280, - 'episode': 'Der Läufer', - 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/817193/0', - 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + }, { + # series (treated as a playlist) + 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'id': '817193', - 'ext': 'mp4', - 'title': 'Die Einweihungsparty', - 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', - 'duration': 1380, - 'series': 'Nr. 47', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Die Einweihungsparty', - 'episode_number': 1, - 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' - } + 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', + 'id': '1115687', + 'series': 'They all came out to Montreux', + 'title': 'They all came out to Montreux', + }, + 'playlist': [{ + 'info_dict': { + 'description': 'md5:f2462744834b959a31adc6292380cda2', + 'duration': 3180, + 'episode': 'Folge 1', + 'episode_number': 1, + 'id': '1112663', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 1', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27', + 'duration': 2935, + 'episode': 'Folge 2', + 'episode_number': 2, + 'id': '1112661', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 2', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:14a93a3356b2492a8f786ab2227ef602', + 'duration': 2994, + 'episode': 'Folge 3', + 'episode_number': 3, + 'id': '1112664', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 3', + 'ext': 'mp4' + } + }], } ] @@ -142,6 +182,6 @@ def _extract_single(self, media_data): 'subtitles': subtitles, 'series': media_data.get('seriesName'), 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name'), + 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, 'episode_number': int_or_none(media_data.get('episodeNumber')), } From 02312c03cf53eb1da24c9ad022ee79af26060733 Mon Sep 17 00:00:00 2001 From: bepvte <8226605+bepvte@users.noreply.github.com> Date: Sun, 28 May 2023 22:54:36 -0700 Subject: [PATCH 124/501] [extractor/twitch] Support mobile clips (#6699) Authored by: bepvte --- yt_dlp/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 9b333f6f67..d7a1cc531a 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -1075,7 +1075,7 @@ class TwitchClipsIE(TwitchBaseIE): https?:// (?: clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| - (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/)?clip/ ) (?P<id>[^/?#&]+) ''' @@ -1111,6 +1111,9 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/clip/FaintLightGullWholeWheat', + 'only_matching': True, }] def _real_extract(self, url): From 
5c14b213679ed4401288bdc86ae696932e219222 Mon Sep 17 00:00:00 2001 From: ping <ping@users.noreply.github.com> Date: Mon, 29 May 2023 14:01:42 +0800 Subject: [PATCH 125/501] [extractor/idolplus] Add extractor (#6732) Authored by: ping Closes #6246 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/idolplus.py | 115 ++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 yt_dlp/extractor/idolplus.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b022442849..999b113783 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -789,6 +789,7 @@ IchinanaLiveIE, IchinanaLiveClipIE, ) +from .idolplus import IdolPlusIE from .ign import ( IGNIE, IGNVideoIE, diff --git a/yt_dlp/extractor/idolplus.py b/yt_dlp/extractor/idolplus.py new file mode 100644 index 0000000000..3c905b0712 --- /dev/null +++ b/yt_dlp/extractor/idolplus.py @@ -0,0 +1,115 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, try_call, url_or_none + + +class IdolPlusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?idolplus\.com/z[us]/(?:concert/|contents/?\?(?:[^#]+&)?albumId=)(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://idolplus.com/zs/contents?albumId=M012077298PPV00', + 'md5': '2ace3f4661c943a2f7e79f0b88cea1e7', + 'info_dict': { + 'id': 'M012077298PPV00', + 'ext': 'mp4', + 'title': '[MultiCam] Aegyo on Top of Aegyo (IZ*ONE EATING TRIP)', + 'release_date': '20200707', + 'formats': 'count:65', + }, + 'params': {'format': '532-KIM_MINJU'}, + }, { + 'url': 'https://idolplus.com/zs/contents?albumId=M01232H058PPV00&catId=E9TX5', + 'info_dict': { + 'id': 'M01232H058PPV00', + 'ext': 'mp4', + 'title': 'YENA (CIRCLE CHART MUSIC AWARDS 2022 RED CARPET)', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # live stream + 'url': 'https://idolplus.com/zu/contents?albumId=M012323174PPV00', + 'info_dict': { + 'id': 'M012323174PPV00', + 'ext': 'mp4', + 'title': 'Hanteo Music Awards 2022 DAY2', + 'release_date': '20230211', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://idolplus.com/zs/concert/M012323039PPV00', + 'info_dict': { + 'id': 'M012323039PPV00', + 'ext': 'mp4', + 'title': 'CIRCLE CHART MUSIC AWARDS 2022', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data_list = traverse_obj(self._download_json( + 'https://idolplus.com/api/zs/viewdata/ruleset/build', video_id, + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'rulesetId': 'contents', + 'albumId': video_id, + 'distribute': 'PRD', + 'loggedIn': 'false', + 'region': 'zs', + 'countryGroup': '00010', + 'lang': 'en', + 'saId': '999999999998', + }), ('data', 'viewData', ...)) + + player_data = {} + while data_list: + player_data = data_list.pop() + if traverse_obj(player_data, 'type') == 'player': + break + elif traverse_obj(player_data, ('dataList', ...)): + data_list += player_data['dataList'] + + formats = self._extract_m3u8_formats(traverse_obj(player_data, ( + 'vodPlayerList', 'vodProfile', 0, 'vodServer', 0, 'video_url', {url_or_none})), video_id) + + subtitles = {} + for caption in traverse_obj(player_data, ('vodPlayerList', 'caption')) or []: + subtitles.setdefault(caption.get('lang') or 'und', []).append({ + 'url': caption.get('smi_url'), + 'ext': 'vtt', + }) + + # Add member multicams as alternative formats + if 
(traverse_obj(player_data, ('detail', 'has_cuesheet')) == 'Y' + and traverse_obj(player_data, ('detail', 'is_omni_member')) == 'Y'): + cuesheet = traverse_obj(self._download_json( + 'https://idolplus.com/gapi/contents/v1.0/content/cuesheet', video_id, + 'Downloading JSON metadata for member multicams', + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'ALBUM_ID': video_id, + 'COUNTRY_GRP': '00010', + 'LANG': 'en', + 'SA_ID': '999999999998', + 'COUNTRY_CODE': 'KR', + }), ('data', 'cuesheet_item', 0)) + + for member in traverse_obj(cuesheet, ('members', ...)): + index = try_call(lambda: int(member['omni_view_index']) - 1) + member_video_url = traverse_obj(cuesheet, ('omni_view', index, 'cdn_url', 0, 'url', {url_or_none})) + if not member_video_url: + continue + member_formats = self._extract_m3u8_formats( + member_video_url, video_id, note=f'Downloading m3u8 for multicam {member["name"]}') + for mf in member_formats: + mf['format_id'] = f'{mf["format_id"]}-{member["name"].replace(" ", "_")}' + formats.extend(member_formats) + + return { + 'id': video_id, + 'title': traverse_obj(player_data, ('detail', 'albumName')), + 'formats': formats, + 'subtitles': subtitles, + 'release_date': traverse_obj(player_data, ('detail', 'broadcastDate')), + } From 4afb208cf07b59291ae3b0c4efc83945ee5b8812 Mon Sep 17 00:00:00 2001 From: jo-nike <derter@gmail.com> Date: Mon, 29 May 2023 02:04:08 -0400 Subject: [PATCH 126/501] [extractor/cbc] Ignore 426 from API (#6781) Closes #6716 Authored by: jo-nike --- yt_dlp/extractor/cbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index e42f062464..41e092422b 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -351,7 +351,9 @@ def _find_secret_formats(self, formats, video_id): def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + video_info = self._download_json( + f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', + video_id, expected_status=426) email, password = self._get_login_info() if email and password: @@ -426,7 +428,7 @@ def _real_extract(self, url): match = self._match_valid_url(url) season_id = match.group('id') show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id) + show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) season = int(match.group('season')) season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) From a58182b75a05fe0a10c5e94a536711d3ade19c20 Mon Sep 17 00:00:00 2001 From: Nam Vu <git@yuru.moe> Date: Mon, 29 May 2023 15:05:51 +0900 Subject: [PATCH 127/501] [cookies] Support custom Safari cookies path (#6783) Authored by: NextFire --- yt_dlp/cookies.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index eb6a2656be..ee2af0f704 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -495,18 +495,22 @@ def decrypt(self, encrypted_value): def _extract_safari_cookies(profile, logger): - if profile is not None: - logger.error('safari does not support profiles') if sys.platform != 'darwin': raise ValueError(f'unsupported platform: {sys.platform}') - cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') - - if not os.path.isfile(cookies_path): - logger.debug('Trying secondary cookie location') - 
cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if profile: + cookies_path = os.path.expanduser(profile) if not os.path.isfile(cookies_path): - raise FileNotFoundError('could not find safari cookies database') + raise FileNotFoundError('custom safari cookies database not found') + + else: + cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') + + if not os.path.isfile(cookies_path): + logger.debug('Trying secondary cookie location') + cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if not os.path.isfile(cookies_path): + raise FileNotFoundError('could not find safari cookies database') with open(cookies_path, 'rb') as f: cookies_data = f.read() From c25cac2f8e5fbac2737a426d7778fd2f0efc5381 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 29 May 2023 01:40:44 -0500 Subject: [PATCH 128/501] [extractor/dacast] Add extractors (#6896) Closes #6163 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/dacast.py | 158 ++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 yt_dlp/extractor/dacast.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 999b113783..0f65f1cc7b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -428,6 +428,10 @@ CybraryIE, CybraryCourseIE ) +from .dacast import ( + DacastVODIE, + DacastPlaylistIE, +) from .daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py new file mode 100644 index 0000000000..cf683bad48 --- /dev/null +++ b/yt_dlp/extractor/dacast.py @@ -0,0 +1,158 @@ +import hashlib +import re +import time +import urllib.error + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + classproperty, + float_or_none, + traverse_obj, + url_or_none, +) + + +class DacastBaseIE(InfoExtractor): + _URL_TYPE = None + + @classproperty + def _VALID_URL(cls): + return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P<user_id>[\w-]+)/(?P<id>[\w-]+)' + + @classproperty + def _EMBED_REGEX(cls): + return [rf'<iframe[^>]+\bsrc=["\'](?P<url>{cls._VALID_URL})'] + + _API_INFO_URL = 'https://playback.dacast.com/content/info' + + @classmethod + def _get_url_from_id(cls, content_id): + user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-') + return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}' + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for content_id in re.findall( + rf'<script[^>]+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage): + yield cls._get_url_from_id(content_id) + + +class DacastVODIE(DacastBaseIE): + _URL_TYPE = 'vod' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090', + 'info_dict': { + 'id': '1c6143e3-5a06-371d-8695-19b96ea49090', + 'ext': 'mp4', + 'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534', + 'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N. 
Bharwani, London/UK', + 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/', + 'info_dict': { + 'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90', + 'ext': 'mp4', + 'title': '4-HowToEmbedVideo.mp4', + 'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3', + 'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html', + 'info_dict': { + 'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa', + 'ext': 'mp4', + 'title': 'Evening Service 2-5-23', + 'uploader_id': '943bb1ab3c03695ba85330d92d6d226e', + 'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) + access = self._download_json( + 'https://playback.dacast.com/content/access', video_id, + note='Downloading access JSON', query=query, expected_status=403) + + error = access.get('error') + if error in ('Broadcaster has been blocked', 'Content is offline'): + raise ExtractorError(error, expected=True) + elif error: + raise ExtractorError(f'Dacast API says "{error}"') + + hls_url = access['hls'] + hls_aes = {} + + if 'DRM_EXT' in hls_url: + self.report_drm(video_id) + elif '/uspaes/' in hls_url: + # From https://player.dacast.com/js/player.js + ts = int(time.time()) + signature = hashlib.sha1( + f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex() + hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}' + + for retry in self.RetryManager(): + try: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + # CDN will randomly respond with 403 + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + retry.error = e + continue + raise + + return { + 'id': video_id, + 'uploader_id': user_id, + 'formats': formats, + 'hls_aes': hls_aes or None, + **traverse_obj(info, ('contentInfo', { + 'title': 'title', + 'duration': ('duration', {float_or_none}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + })), + } + + +class DacastPlaylistIE(DacastBaseIE): + _URL_TYPE = 'playlist' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + + def _real_extract(self, url): + user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id') + info = self._download_json( + self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={ + 'contentId': 
f'{user_id}-playlist-{playlist_id}', + 'provider': 'universe', + })['contentInfo'] + + def entries(info): + for video in traverse_obj(info, ('features', 'playlist', 'contents', lambda _, v: v['id'])): + yield self.url_result( + DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title')) + + return self.playlist_result(entries(info), playlist_id, info.get('title')) From 3459d3c5af3b2572ed51e8ecfda6c11022a838c6 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Mon, 29 May 2023 18:33:37 +0900 Subject: [PATCH 129/501] [extractor/JStream] Add extractor (#6252) Authored by: Lesmiscore --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jstream.py | 73 +++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 yt_dlp/extractor/jstream.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0f65f1cc7b..d560ed91c8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -878,6 +878,7 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .joj import JojIE +from .jstream import JStreamIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE diff --git a/yt_dlp/extractor/jstream.py b/yt_dlp/extractor/jstream.py new file mode 100644 index 0000000000..3e2e627125 --- /dev/null +++ b/yt_dlp/extractor/jstream.py @@ -0,0 +1,73 @@ +import base64 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + js_to_json, + remove_start, +) + + +class JStreamIE(InfoExtractor): + # group "id" only exists for compliance, not directly used in requests + # also all components are mandatory + _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))' + + _TESTS = [{ + 'url': 'jstream:www50:eqd638pvwx:752', + 'info_dict': { + 'id': 'eqd638pvwx:752', + 'ext': 'mp4', + 'title': '阪神淡路大震災 激震の記録2020年版 解説動画', + 'duration': 672, + 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg', + }, + }] + + def _parse_jsonp(self, callback, string, video_id): + return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id) + + def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles): + for value in movie_list_hls: + text = value.get('text') or '' + if not text.startswith('auto'): + continue + m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id) + self._merge_subtitles(subs, target=subtitles) + yield from fmts + + def _real_extract(self, url): + host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id') + video_info_jsonp = self._download_webpage( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp', + video_id, 'Requesting video info') + video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie'] + subtitles = {} + formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles)) + self._remove_duplicate_formats(formats) + return { + 'id': video_id, + 'title': video_info.get('title'), + 'duration': float_or_none(video_info.get('duration')), + 'thumbnail': video_info.get('thumbnail_url'), + 'formats': formats, + 'subtitles': subtitles, + } + + @classmethod + 
def _extract_embed_urls(cls, url, webpage):
+        # check for eligibility of webpage
+        # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89
+        script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage)
+        if not script_tag:
+            return
+        host, publisher = script_tag.groups()
+        for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage):
+            # TODO: using json.loads here as InfoExtractor._parse_json is not a classmethod
+            info = json.loads(js_to_json(m.group(1)))
+            mid = base64.b64decode(info.get('m')).decode()
+            yield f'jstream:{host}:{publisher}:{mid}'
From f8f9250fe280d37f0988646cd5cc0072f4d33a6d Mon Sep 17 00:00:00 2001
From: Lesmiscore <nao20010128@gmail.com>
Date: Mon, 29 May 2023 18:35:10 +0900
Subject: [PATCH 130/501] [extractor/niconico:live] Add extractor (#5764)

Authored by: Lesmiscore
---
 yt_dlp/downloader/__init__.py   |   3 +-
 yt_dlp/downloader/niconico.py   | 101 +++++++++++++++++++-
 yt_dlp/extractor/_extractors.py |   1 +
 yt_dlp/extractor/niconico.py    | 163 ++++++++++++++++++++++++++++++++
 4 files changed, 266 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py
index c34dbcea95..51a9f28f06 100644
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@@ -30,7 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N
 from .http import HttpFD
 from .ism import IsmFD
 from .mhtml import MhtmlFD
-from .niconico import NiconicoDmcFD
+from .niconico import NiconicoDmcFD, NiconicoLiveFD
 from .rtmp import RtmpFD
 from .rtsp import RtspFD
 from .websocket import WebSocketFragmentFD
@@ -50,6 +50,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N
     'ism': IsmFD,
     'mhtml': MhtmlFD,
     'niconico_dmc': NiconicoDmcFD,
+    'niconico_live': NiconicoLiveFD,
     'fc2_live': FC2LiveFD,
     'websocket_frag': WebSocketFragmentFD,
     'youtube_live_chat': YoutubeLiveChatFD,
diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py
index 77ed39e5b9..cfe7397845 100644
--- a/yt_dlp/downloader/niconico.py
+++ b/yt_dlp/downloader/niconico.py
@@ -1,8 +1,17 @@
+import json
 import threading
+import time
 
 from . 
import get_suitable_downloader from .common import FileDownloader -from ..utils import sanitized_Request +from .external import FFmpegFD +from ..utils import ( + DownloadError, + str_or_none, + sanitized_Request, + WebSocketsWrapper, + try_get, +) class NiconicoDmcFD(FileDownloader): @@ -50,3 +59,93 @@ def heartbeat(): timer[0].cancel() download_complete = True return success + + +class NiconicoLiveFD(FileDownloader): + """ Downloads niconico live without being stopped """ + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + ws_url = info_dict['url'] + ws_extractor = info_dict['ws'] + ws_origin_host = info_dict['origin'] + cookies = info_dict.get('cookies') + live_quality = info_dict.get('live_quality', 'high') + live_latency = info_dict.get('live_latency', 'high') + dl = FFmpegFD(self.ydl, self.params or {}) + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'protocol': 'm3u8', + }) + + def communicate_ws(reconnect): + if reconnect: + ws = WebSocketsWrapper(ws_url, { + 'Cookies': str_or_none(cookies) or '', + 'Origin': f'https://{ws_origin_host}', + 'Accept': '*/*', + 'User-Agent': self.params['http_headers']['User-Agent'], + }) + if self.ydl.params.get('verbose', False): + self.to_screen('[debug] Sending startWatching request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': live_quality, + 'protocol': 'hls+fmp4', + 'latency': live_latency, + 'chasePlay': False + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True + }, + 'reconnect': True, + } + })) + else: + ws = ws_extractor + with ws: + while True: + recv = ws.recv() + if not recv: + continue + data = json.loads(recv) + if not data or not isinstance(data, dict): + continue + if data.get('type') == 'ping': + # pong back + ws.send(r'{"type":"pong"}') + ws.send(r'{"type":"keepSeat"}') + elif data.get('type') == 'disconnect': + self.write_debug(data) + return True + elif data.get('type') == 'error': + self.write_debug(data) + message = try_get(data, lambda x: x['body']['code'], str) or recv + return DownloadError(message) + elif self.ydl.params.get('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...' 
+                        self.to_screen('[debug] Server said: %s' % recv)
+
+        def ws_main():
+            reconnect = False
+            while True:
+                try:
+                    ret = communicate_ws(reconnect)
+                    if ret is True:
+                        return
+                except BaseException as e:
+                    self.to_screen('[%s] %s: Connection error occurred, reconnecting after 10 seconds: %s' % ('niconico:live', video_id, str_or_none(e)))
+                    time.sleep(10)
+                    continue
+                finally:
+                    reconnect = True
+
+        thread = threading.Thread(target=ws_main, daemon=True)
+        thread.start()
+
+        return dl.download(filename, new_info_dict)
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index d560ed91c8..07249bba6b 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1275,6 +1275,7 @@
     NicovideoSearchIE,
     NicovideoSearchURLIE,
     NicovideoTagURLIE,
+    NiconicoLiveIE,
 )
 from .ninecninemedia import (
     NineCNineMediaIE,
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 30b4d7216f..89e8e60939 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -5,13 +5,17 @@
 import re
 import time
 
+from urllib.parse import urlparse
+
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_HTTPError,
 )
+from ..dependencies import websockets
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
+    WebSocketsWrapper,
     bug_reports_message,
     clean_html,
     float_or_none,
@@ -895,3 +899,162 @@ def _entries(self, list_id):
     def _real_extract(self, url):
         list_id = self._match_id(url)
         return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
+
+
+class NiconicoLiveIE(InfoExtractor):
+    IE_NAME = 'niconico:live'
+    IE_DESC = 'ニコニコ生放送'
+    _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)'
+    _TESTS = [{
+        'note': 'this test case includes invisible characters for title, pasting them as-is',
+        'url': 'https://live.nicovideo.jp/watch/lv339533123',
+        'info_dict': {
+            'id': 'lv339533123',
+            'title': '激辛ペヤング食べます‪( ;ᯅ; )‬(歌枠オーディション参加中)',
+            'view_count': 1526,
+            'comment_count': 1772,
+            'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます',
+            'uploader': 'もか',
+            'channel': 'ゲストさんのコミュニティ',
+            'channel_id': 'co5776900',
+            'channel_url': 'https://com.nicovideo.jp/community/co5776900',
+            'timestamp': 1670677328,
+            'is_live': True,
+        },
+        'skip': 'livestream',
+    }, {
+        'url': 'https://live2.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }, {
+        'url': 'https://sp.live.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }, {
+        'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123',
+        'only_matching': True,
+    }]
+
+    _KNOWN_LATENCY = ('high', 'low')
+
+    def _real_extract(self, url):
+        if not websockets:
+            raise ExtractorError('websockets library is not available. Please install it.', expected=True)
+        video_id = self._match_id(url)
+        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
+
+        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
+            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
+
+        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
+        if not ws_url:
+            raise ExtractorError('The live hasn\'t started yet or has already ended.', expected=True)
+        ws_url = update_url_query(ws_url, {
+            'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
+        })
+
+        hostname = remove_start(urlparse(urlh.geturl()).hostname, 'sp.')
+        cookies = try_get(urlh.geturl(), self._downloader._calc_cookies)
+        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
+        if latency not in self._KNOWN_LATENCY:
+            latency = 'high'
+
+        ws = WebSocketsWrapper(ws_url, {
+            'Cookies': str_or_none(cookies) or '',
+            'Origin': f'https://{hostname}',
+            'Accept': '*/*',
+            'User-Agent': self.get_param('http_headers')['User-Agent'],
+        })
+
+        self.write_debug('Sending HLS server request')
+        ws.send(json.dumps({
+            'type': 'startWatching',
+            'data': {
+                'stream': {
+                    'quality': 'abr',
+                    'protocol': 'hls+fmp4',
+                    'latency': latency,
+                    'chasePlay': False
+                },
+                'room': {
+                    'protocol': 'webSocket',
+                    'commentable': True
+                },
+                'reconnect': False,
+            }
+        }))
+
+        while True:
+            recv = ws.recv()
+            if not recv:
+                continue
+            data = json.loads(recv)
+            if not isinstance(data, dict):
+                continue
+            if data.get('type') == 'stream':
+                m3u8_url = data['data']['uri']
+                qualities = data['data']['availableQualities']
+                break
+            elif data.get('type') == 'disconnect':
+                self.write_debug(recv)
+                raise ExtractorError('Disconnected in the middle of extraction')
+            elif data.get('type') == 'error':
+                self.write_debug(recv)
+                message = traverse_obj(data, ('body', 'code')) or recv
+                raise ExtractorError(message)
+            elif self.get_param('verbose', False):
+                if len(recv) > 100:
+                    recv = recv[:100] + '...' 
+ self.write_debug('Server said: %s' % recv) + + title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta( + ('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + + raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {} + thumbnails = [] + for name, value in raw_thumbs.items(): + if not isinstance(value, dict): + thumbnails.append({ + 'id': name, + 'url': value, + **parse_resolution(value, lenient=True), + }) + continue + + for k, img_url in value.items(): + res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True) + width, height = res.get('width'), res.get('height') + + thumbnails.append({ + 'id': f'{name}_{width}x{height}', + 'url': img_url, + **res, + }) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) + for fmt, q in zip(formats, reversed(qualities[1:])): + fmt.update({ + 'format_id': q, + 'protocol': 'niconico_live', + 'ws': ws, + 'video_id': video_id, + 'cookies': cookies, + 'live_latency': latency, + 'origin': hostname, + }) + + return { + 'id': video_id, + 'title': title, + **traverse_obj(embedded_data, { + 'view_count': ('program', 'statistics', 'watchCount'), + 'comment_count': ('program', 'statistics', 'commentCount'), + 'uploader': ('program', 'supplier', 'name'), + 'channel': ('socialGroup', 'name'), + 'channel_id': ('socialGroup', 'id'), + 'channel_url': ('socialGroup', 'socialGroupPageUrl'), + }), + 'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))), + 'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))), + 'is_live': True, + 'thumbnails': thumbnails, + 'formats': formats, + } From fd5d93f7040f9776fd541f4e4079dad7d3b3fb4f Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Mon, 29 May 2023 04:42:03 -0500 Subject: [PATCH 131/501] Bugfix for b844a3f8b16500663e7ab6c6ec061cc9b30f71ac [extractor/weverse] Avoid unnecessary duplicate login Authored by: bashonly --- yt_dlp/extractor/weverse.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index ab629c885c..8f2a7ee06b 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -34,6 +34,9 @@ class WeverseBaseIE(InfoExtractor): } def _perform_login(self, username, password): + if self._API_HEADERS.get('Authorization'): + return + headers = { 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', 'x-acc-app-version': '2.2.6', From f6e43d6fa9804c24525e1fed0a87782754dab7ed Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 29 May 2023 05:07:35 -0500 Subject: [PATCH 132/501] [extractor/cbsnews] Overhaul extractors (#6681) Closes #6565 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 8 +- yt_dlp/extractor/anvato.py | 10 +- yt_dlp/extractor/cbslocal.py | 116 ---------- yt_dlp/extractor/cbsnews.py | 382 ++++++++++++++++++++++++++++---- 4 files changed, 346 insertions(+), 170 deletions(-) delete mode 100644 yt_dlp/extractor/cbslocal.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 07249bba6b..31af6bd3f8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -313,14 +313,14 @@ CBSIE, ParamountPressExpressIE, ) -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, CBSNewsIE, + CBSLocalIE, + CBSLocalArticleIE, + CBSLocalLiveIE, + CBSNewsLiveIE, 
CBSNewsLiveVideoIE, ) from .cbssports import ( diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index 79bfe412b2..0df50333c3 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -336,7 +336,7 @@ def _get_anvato_videos(self, access_key, video_id, token): elif media_format == 'm3u8-variant' or ext == 'm3u8': # For some videos the initial m3u8 URL returns JSON instead manifest_json = self._download_json( - video_url, video_id, note='Downloading manifest JSON', errnote=False) + video_url, video_id, note='Downloading manifest JSON', fatal=False) if manifest_json: video_url = manifest_json.get('master_m3u8') if not video_url: @@ -392,14 +392,6 @@ def _extract_from_webpage(cls, url, webpage): url = smuggle_url(url, {'token': anvplayer_data['token']}) yield cls.url_result(url, AnvatoIE, video_id) - def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json( - self._html_search_regex( - self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), - video_id) - return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default' - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) self._initialize_geo_bypass({ diff --git a/yt_dlp/extractor/cbslocal.py b/yt_dlp/extractor/cbslocal.py deleted file mode 100644 index 3d50b0499f..0000000000 --- a/yt_dlp/extractor/cbslocal.py +++ /dev/null @@ -1,116 +0,0 @@ -from .anvato import AnvatoIE -from .sendtonews import SendtoNewsIE -from ..compat import compat_urlparse -from ..utils import ( - parse_iso8601, - unified_timestamp, -) - - -class CBSLocalIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' - _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mcp_id = self._match_id(url) - return self.url_result( - 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) - - -class CBSLocalArticleIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' - - _TESTS = [{ - # Anvato backend - 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', - 'md5': 'f0ee3081e3843f575fccef901199b212', - 'info_dict': { - 'id': '3401037', - 'ext': 'mp4', - 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', - 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. 
Randy Paige reports.', - 'thumbnail': 're:^https?://.*', - 'timestamp': 1463440500, - 'upload_date': '20160516', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\KCBSTV', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\AOL', - 'Syndication\\Yahoo', - 'Syndication\\Tribune', - 'Syndication\\Curb.tv', - 'Content\\News' - ], - 'tags': ['CBS 2 News Evening'], - }, - }, { - # SendtoNews embed - 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', - 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - }, - 'playlist_count': 9, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - sendtonews_url = SendtoNewsIE._extract_url(webpage) - if sendtonews_url: - return self.url_result( - compat_urlparse.urljoin(url, sendtonews_url), - ie=SendtoNewsIE.ie_key()) - - info_dict = self._extract_anvato_videos(webpage, display_id) - - timestamp = unified_timestamp(self._html_search_regex( - r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, - 'released date', default=None)) or parse_iso8601( - self._html_search_meta('uploadDate', webpage)) - - info_dict.update({ - 'display_id': display_id, - 'timestamp': timestamp, - }) - - return info_dict diff --git a/yt_dlp/extractor/cbsnews.py b/yt_dlp/extractor/cbsnews.py index 16edf3af86..65ecc62f02 100644 --- a/yt_dlp/extractor/cbsnews.py +++ b/yt_dlp/extractor/cbsnews.py @@ -1,36 +1,153 @@ +import base64 import re +import urllib.error +import urllib.parse import zlib +from .anvato import AnvatoIE from .common import InfoExtractor -from .cbs import CBSIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, -) +from .paramountplus import ParamountPlusIE from ..utils import ( + ExtractorError, + HEADRequest, + UserNotLive, + determine_ext, + float_or_none, + format_field, + int_or_none, + make_archive_id, + mimetype2ext, parse_duration, + smuggle_url, + traverse_obj, + url_or_none, ) -class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsBaseIE(InfoExtractor): + _LOCALES = { + 'atlanta': None, + 'baltimore': 'BAL', + 'boston': 'BOS', + 'chicago': 'CHI', + 'colorado': 'DEN', + 'detroit': 'DET', + 'losangeles': 'LA', + 'miami': 'MIA', + 'minnesota': 'MIN', + 'newyork': 'NY', + 'philadelphia': 'PHI', + 'pittsburgh': 'PIT', + 'sacramento': 'SAC', + 'sanfrancisco': 'SF', + 'texas': 'DAL', + } + _LOCALE_RE = '|'.join(map(re.escape, _LOCALES)) + _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl' + + def _get_item(self, webpage, display_id): + return traverse_obj(self._search_json( + r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id, + default={}), ('items', 0, {dict})) or {} + + def _get_video_url(self, item): + return traverse_obj(item, 'video', 'video2', expected_type=url_or_none) + + def _extract_playlist(self, webpage, playlist_id): + entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall( + r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)] + if entries: + return self.playlist_result( + entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage), + self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + def _extract_video(self, item, video_url, video_id): + if mimetype2ext(item.get('format'), 
default=determine_ext(video_url)) == 'mp4': + formats = [{'url': video_url, 'ext': 'mp4'}] + + else: + manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information') + + anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None) + # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source + if anvato_id: + return self.url_result( + smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}), + AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + formats, _ = self._parse_m3u8_formats_and_subtitles( + manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id) + + def get_subtitles(subs_url): + return { + 'en': [{ + 'url': subs_url, + 'ext': 'dfxp', # TTAF1 + }], + } if url_or_none(subs_url) else None + + episode_meta = traverse_obj(item, { + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + }) if item.get('isFullEpisode') else {} + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(item, { + 'title': (None, ('fulltitle', 'title')), + 'description': 'dek', + 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration', {float_or_none}), + 'subtitles': ('captions', {get_subtitles}), + 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}), + 'is_live': ('type', {lambda x: x == 'live'}), + }, get_all=False), + **episode_meta, + } + + +class CBSNewsEmbedIE(CBSNewsBaseIE): IE_NAME = 'cbsnews:embed' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)' _TESTS = [{ 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz
9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', - 'only_matching': True, + 'info_dict': { + 'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA', + 'ext': 'mp4', + 'title': 'Cops investigate gorilla incident at Cincinnati Zoo', + 'description': 'md5:fee7441ab8aaeb3c693482394738102b', + 'duration': 350, + 'timestamp': 1464719713, + 'upload_date': '20160531', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - item = self._parse_json(zlib.decompress(compat_b64decode( - compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode( + urllib.parse.unquote(self._match_id(url))), + -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {} + + video_id = item['mpxRefId'] + video_url = self._get_video_url(item) + if not video_url: + # Old embeds redirect user to ParamountPlus but most links are 404 + pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}' + try: + self._request_webpage(HEADRequest(pplus_url), video_id) + return self.url_result(pplus_url, ParamountPlusIE) + except ExtractorError: + self.raise_no_formats('This video is no longer available', True, video_id) + + return self._extract_video(item, video_url, video_id) -class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsIE(CBSNewsBaseIE): IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\w-]+)' _TESTS = [ { @@ -47,10 +164,7 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'timestamp': 1476046464, 'upload_date': '20161009', }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'skip': 'This video is no longer available', }, { 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', 'upload_date': '20140404', 'timestamp': 1396650660, - 'uploader': 'CBSI-NEW', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ - 'ext': 'ttml', + 'ext': 'dfxp', }], }, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { # 48 hours 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'info_dict': { + 'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved', 'title': 'Cold as Ice', 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', }, 'playlist_mincount': 7, }, + { + 'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/', + 'info_dict': { + 'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE', + 'ext': 'mp4', + 'title': 'CBS Evening News, March 28, 2023', + 'description': 'md5:db20615aae54adc1d55a1fd69dc75d13', + 'duration': 1189, + 'timestamp': 1680042600, + 
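The embed fragment decoded in `_real_extract` above is URL-escaped base64 of a raw DEFLATE stream; the negative `wbits` tells zlib not to expect the usual zlib header and checksum. A self-contained round-trip, with a made-up payload standing in for the real embed data:

    import base64
    import json
    import urllib.parse
    import zlib

    payload = {'video': {'items': [{'mpxRefId': 'demo'}]}}  # made-up stand-in

    # encode the way the embed fragment appears to be built: raw DEFLATE
    # (no zlib header), then base64, then URL-escaping
    compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS)
    raw = compressor.compress(json.dumps(payload).encode()) + compressor.flush()
    fragment = urllib.parse.quote(base64.b64encode(raw))

    # decode exactly as CBSNewsEmbedIE._real_extract() does above
    item = json.loads(zlib.decompress(
        base64.b64decode(urllib.parse.unquote(fragment)), -zlib.MAX_WBITS))
    assert item == payload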
'upload_date': '20230328', + 'season': 'Season 2023', + 'season_number': 2023, + 'episode': 'Episode 83', + 'episode_number': 83, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - entries = [] - for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): - entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) - if entries: - return self.playlist_result( - entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), - playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist - item = self._parse_json(self._html_search_regex( - r'CBSNEWS\.defaultPayload\s*=\s*({.+})', - webpage, 'video JSON info'), display_id)['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + video_url = self._get_video_url(item) + if not video_url: + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalBaseIE(CBSNewsBaseIE): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + anvato_id = None + video_url = self._get_video_url(item) + + if not video_url: + anv_params = self._search_regex( + r'<iframe[^>]+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"', + webpage, 'Anvato URL', default=None) + + if not anv_params: + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id) + anvato_id = anv_data['v'] + return self.url_result( + smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', { + 'token': anv_data.get('token') or 'default', + }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P<id>[\w-]+)' + _TESTS = [{ + # Anvato video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/', + 'info_dict': { + 'id': '6376747', + 'ext': 'mp4', + 'title': '1st cannabis dispensary opens in Queens', + 'description': 'The dispensary is women-owned and located in Jamaica.', + 'uploader': 'CBS', + 'duration': 20, + 'timestamp': 1680193657, + 'upload_date': '20230330', + 'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'], + 'tags': 'count:11', + 'thumbnail': 're:^https?://.*', + '_old_archive_ids': ['cbslocal 6376747'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # cbsnews.com video via defaultPayload JSON + 'url': 
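The `f'{anv_params}==='` trick above is worth noting: base64 data embedded in URLs frequently loses its `=` padding, and Python's decoder rejects under-padded input but silently ignores surplus padding, so unconditionally appending `===` makes any length decodable:

    import base64

    def b64_urlsafe_decode_forgiving(data: str) -> bytes:
        # surplus '=' padding is ignored, missing padding is an error,
        # so over-padding is always safe
        return base64.urlsafe_b64decode(data + '===')

    assert b64_urlsafe_decode_forgiving('eyJ2IjoxfQ') == b'{"v":1}'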
'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/', + 'info_dict': { + 'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3', + 'ext': 'mp4', + 'title': 'the city is sounding the alarm on dangerous social media challenges', + 'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6', + 'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg', + 'duration': 41.0, + 'timestamp': 1680196615, + 'upload_date': '20230330', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + +class CBSLocalArticleIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P<id>[\w-]+)' + _TESTS = [{ + # Anvato video via iframe embed + 'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service', + 'title': 'MTA station agents begin leaving their booths to provide more direct customer service', + 'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.', + }, + }, { + 'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + }, + 'skip': 'Video has been removed', + }] + + +class CBSNewsLiveBaseIE(CBSNewsBaseIE): + def _get_id(self, url): + raise NotImplementedError('This method must be implemented by subclasses') + + def _real_extract(self, url): + video_id = self._get_id(url) + if not video_id: + raise ExtractorError('Livestream is not available', expected=True) + + data = traverse_obj(self._download_json( + 'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={ + 'partner': 'cbsnsite', + 'edition': video_id, + 'type': 'live', + }), ('navigation', 'data', 0, {dict})) + + video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False) + if not video_url: + raise UserNotLive(video_id=video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(data, { + 'title': 'headline', + 'description': 'rundown_slug', + 'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}), + }), + } + + +class CBSLocalLiveIE(CBSNewsLiveBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P<id>{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/losangeles/live/', + 'info_dict': { + 'id': 'CBSN-LA', + 'ext': 'mp4', + 'title': str, + 'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s') + + +class CBSNewsLiveIE(CBSNewsLiveBaseIE): + IE_NAME = 'cbsnews:live' + IE_DESC = 'CBS News Livestream' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/live/', + 'info_dict': { + 'id': 
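`CBSNewsLiveBaseIE` above shows the `traverse_obj` mapping style this series converges on: a dict of output fields to paths, where a string is a plain key lookup, a tuple walks nested objects, and a `{function}` set validates or coerces the resolved value, with `None` results dropped. A small self-contained illustration (the sample data is made up):

    from yt_dlp.utils import int_or_none, traverse_obj, url_or_none

    data = {
        'headline': 'Demo stream',
        'rundown_slug': 'DEMO.RUNDOWN',
        'images': {'thumbnail_url_hd': 'https://example.com/t.jpg'},
        'segmentDur': '41',
    }
    info = traverse_obj(data, {
        'title': 'headline',                                         # plain key
        'description': 'rundown_slug',
        'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}),  # nested path + validator
        'duration': ('segmentDur', {int_or_none}),                   # coercion
    })
    assert info['duration'] == 41 and info['thumbnail'].endswith('.jpg')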
'CBSN-US', + 'ext': 'mp4', + 'title': str, + 'description': r're:\w+ \w+ CRISPIN RUNDOWN', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return 'CBSN-US' class CBSNewsLiveVideoIE(InfoExtractor): @@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)' # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples - _TEST = { + _TESTS = [{ 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'duration': 334, }, 'skip': 'Video gone', - } + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -131,13 +431,13 @@ def _real_extract(self, url): 'dvr_slug': display_id, }) - formats = self._extract_akamai_formats(video_info['url'], display_id) - return { 'id': display_id, 'display_id': display_id, - 'title': video_info['headline'], - 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), - 'duration': parse_duration(video_info.get('segmentDur')), - 'formats': formats, + 'formats': self._extract_akamai_formats(video_info['url'], display_id), + **traverse_obj(video_info, { + 'title': 'headline', + 'thumbnail': ('thumbnail_url_hd', {url_or_none}), + 'duration': ('segmentDur', {parse_duration}), + }), } From 2d306c03d6f2697fcbabb7da35aa62cc078359d3 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Mon, 29 May 2023 06:17:29 -0400 Subject: [PATCH 133/501] [extractor/rottentomatoes] Fix extractor (#6844) Closes #6729 Authored by: JChris246 --- yt_dlp/extractor/rottentomatoes.py | 82 ++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/rottentomatoes.py b/yt_dlp/extractor/rottentomatoes.py index f133c851be..e35717522b 100644 --- a/yt_dlp/extractor/rottentomatoes.py +++ b/yt_dlp/extractor/rottentomatoes.py @@ -1,30 +1,80 @@ from .common import InfoExtractor -from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + ExtractorError, + clean_html, + float_or_none, + get_element_by_class, + join_nonempty, + traverse_obj, + url_or_none, +) class RottenTomatoesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/(?P<playlist>[^/]+)(?:/(?P<tr>trailers)(?:/(?P<id>\w+))?)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', - 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.' 
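The RottenTomatoes rewrite below drops the InternetVideoArchive redirect and reads the video list straight from the page: `_search_json` anchors on the `<script id="videos">`/`<script id="heroVideos">` tag and uses `contains_pattern` to restrict the match to the JSON array. The same mechanics with plain `re` and `json` (the sample HTML is made up):

    import json
    import re

    html = ('<script id="videos" type="application/json">'
            '[{"publicId": "abc", "type": "hls"}]</script>')
    match = re.search(
        r'<script[^>]+\bid=["\']videos["\'][^>]*>\s*(\[{.+?}\])\s*</script>',
        html, re.DOTALL)
    videos = json.loads(match.group(1)) if match else []
    assert videos[0]['publicId'] == 'abc'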
}, - } + 'skip': 'No longer available', + }, { + 'url': 'https://www.rottentomatoes.com/m/toy_story_3/trailers/VycaVoBKhGuk', + 'info_dict': { + 'id': 'VycaVoBKhGuk', + 'ext': 'mp4', + 'title': 'Toy Story 3: Trailer 2', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149.941 + }, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3', + 'info_dict': { + 'id': 'toy_story_3', + 'title': 'Toy Story 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers', + 'info_dict': { + 'id': 'toy_story_3-trailers', + }, + 'playlist_mincount': 5, + }] + + def _extract_videos(self, data, display_id): + for video in traverse_obj(data, (lambda _, v: v['publicId'] and v['file'] and v['type'] == 'hls')): + yield { + 'formats': self._extract_m3u8_formats( + video['file'], display_id, 'mp4', m3u8_id='hls', fatal=False), + **traverse_obj(video, { + 'id': 'publicId', + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {float_or_none}), + 'thumbnail': ('image', {url_or_none}), + }), + } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + playlist_id, trailers, video_id = self._match_valid_url(url).group('playlist', 'tr', 'id') + playlist_id = join_nonempty(playlist_id, trailers) + webpage = self._download_webpage(url, playlist_id) + data = self._search_json( + r'<script[^>]+\bid=["\'](?:heroV|v)ideos["\'][^>]*>', webpage, + 'data', playlist_id, contains_pattern=r'\[{(?s:.+)}\]') - return { - '_type': 'url_transparent', - 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, - 'ie_key': InternetVideoArchiveIE.ie_key(), - 'id': video_id, - 'title': self._og_search_title(webpage), - } + if video_id: + video_data = traverse_obj(data, lambda _, v: v['publicId'] == video_id) + if not video_data: + raise ExtractorError('Unable to extract video from webpage') + return next(self._extract_videos(video_data, video_id)) + + return self.playlist_result( + self._extract_videos(data, playlist_id), playlist_id, + clean_html(get_element_by_class('scoreboard__title', webpage))) From 489f51279d00318018478fd7461eddbe3b45297e Mon Sep 17 00:00:00 2001 From: hasezoey <hasezoey@gmail.com> Date: Mon, 29 May 2023 12:52:01 +0200 Subject: [PATCH 134/501] [extractor/nekohacker] Add extractor (#7003) Authored by: hasezoey --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/nekohacker.py | 217 ++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+) create mode 100644 yt_dlp/extractor/nekohacker.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 31af6bd3f8..f9fa84c43b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1206,6 +1206,7 @@ NebulaSubscriptionsIE, NebulaChannelIE, ) +from .nekohacker import NekoHackerIE from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( diff --git a/yt_dlp/extractor/nekohacker.py b/yt_dlp/extractor/nekohacker.py new file mode 100644 index 0000000000..e10ffe925a --- /dev/null +++ b/yt_dlp/extractor/nekohacker.py @@ -0,0 +1,217 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + extract_attributes, + get_element_by_class, + 
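NekoHackerIE below drives everything off the `data-*` attributes of each playlist `<li>`; `extract_attributes` turns an element's opening tag into a plain dict that `traverse_obj` can then map. For example, with a made-up tag in the shape the extractor matches:

    from yt_dlp.utils import extract_attributes

    li_tag = ('<li data-audiopath="https://example.com/audio/01-spaceship.mp3" '
              'data-trackid="1712" data-tracktitle="Spaceship" '
              'data-albumtitle="Nekoverse">')
    attrs = extract_attributes(li_tag)
    assert attrs['data-tracktitle'] == 'Spaceship'
    assert attrs['data-audiopath'].endswith('.mp3')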
get_element_text_and_html_by_tag, + parse_duration, + traverse_obj, + try_call, + url_or_none, +) + + +class NekoHackerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nekohacker\.com/(?P<id>(?!free-dl)[\w-]+)' + _TESTS = [{ + 'url': 'https://nekohacker.com/nekoverse/', + 'info_dict': { + 'id': 'nekoverse', + 'title': 'Nekoverse', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3', + 'md5': '44223701ebedba0467ebda4cc07fb3aa', + 'info_dict': { + 'id': '1712', + 'ext': 'mp3', + 'title': 'Spaceship', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Spaceship', + 'track_number': 1, + 'duration': 195.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/02-City-Runner.mp3', + 'md5': '8f853c71719389d32bbbd3f1a87b3f08', + 'info_dict': { + 'id': '1713', + 'ext': 'mp3', + 'title': 'City Runner', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'City Runner', + 'track_number': 2, + 'duration': 148.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/03-Nature-Talk.mp3', + 'md5': '5a8a8ae852720cee4c0ac95c7d1a7450', + 'info_dict': { + 'id': '1714', + 'ext': 'mp3', + 'title': 'Nature Talk', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Nature Talk', + 'track_number': 3, + 'duration': 174.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/04-Crystal-World.mp3', + 'md5': 'd8e59a48061764e50d92386a294abd50', + 'info_dict': { + 'id': '1715', + 'ext': 'mp3', + 'title': 'Crystal World', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Crystal World', + 'track_number': 4, + 'duration': 199.0 + } + } + ] + }, { + 'url': 'https://nekohacker.com/susume/', + 'info_dict': { + 'id': 'susume', + 'title': '進め!むじなカンパニー', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-feat.-六科なじむ-CV_-日高里菜-割戶真友-CV_-金元寿子-軽井沢ユキ-CV_-上坂すみれ-出稼ぎガルシア-CV_-金子彩花-.mp3', + 'md5': 'fb13f008aa81f26ba48f91fd2d6186ce', + 'info_dict': { + 'id': '711', + 'ext': 'mp3', + 'title': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'track_number': 1, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-feat.-六科なじむ-CV_-日高里菜-.mp3', + 'md5': '028803f70241df512b7764e73396fdd1', + 'info_dict': { + 'id': '709', + 'ext': 'mp3', + 'title': 'むじな de なじむ feat. 
六科なじむ (CV: 日高里菜 )', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'track_number': 2, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-instrumental.mp3', + 'md5': 'adde9e9a16e1da5e602b579c247d0fb9', + 'info_dict': { + 'id': '710', + 'ext': 'mp3', + 'title': '進め!むじなカンパニー (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': '進め!むじなカンパニー (instrumental)', + 'track_number': 3, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-instrumental.mp3', + 'md5': 'ebb0443039cf5f9ff7fd557ed9b23599', + 'info_dict': { + 'id': '712', + 'ext': 'mp3', + 'title': 'むじな de なじむ (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ (instrumental)', + 'track_number': 4, + 'duration': None + } + } + ] + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + playlist = get_element_by_class('playlist', webpage) + + if not playlist: + iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or '' + iframe_src = url_or_none(extract_attributes(iframe).get('src')) + if not iframe_src: + raise ExtractorError('No playlist or embed found in webpage') + elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src): + raise ExtractorError('Spotify embeds are not supported', expected=True) + return self.url_result(url, 'Generic') + + entries = [] + for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1): + entry = traverse_obj(extract_attributes(track), { + 'url': ('data-audiopath', {url_or_none}), + 'ext': ('data-audiopath', {determine_ext}), + 'id': 'data-trackid', + 'title': 'data-tracktitle', + 'track': 'data-tracktitle', + 'album': 'data-albumtitle', + 'duration': ('data-tracktime', {parse_duration}), + 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'thumbnail': ('data-albumart', {url_or_none}), + }) + entries.append({ + **entry, + 'track_number': track_number, + 'artist': 'Neko Hacker', + 'vcodec': 'none', + 'acodec': 'mp3' if entry['ext'] == 'mp3' else None, + }) + + return self.playlist_result(entries, playlist_id, traverse_obj(entries, (0, 'album'))) From b38d4c941d1993ab27e4c0f8e024e23c2ec0f8f8 Mon Sep 17 00:00:00 2001 From: Matt Broadway <mattdbway@gmail.com> Date: Mon, 29 May 2023 14:51:35 +0100 Subject: [PATCH 135/501] [cookies] Update for chromium changes (#6897) Authored by: mbway --- test/test_cookies.py | 18 +++-- yt_dlp/cookies.py | 187 +++++++++++++++++++++++++++++++------------ 2 files changed, 150 insertions(+), 55 deletions(-) diff --git a/test/test_cookies.py b/test/test_cookies.py index 4155bcbf55..5282ef6215 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -49,32 +49,38 @@ def test_get_desktop_environment(self): """ based on https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util_unittest.cc 
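The expanded table below pins down Chromium's detection precedence: `XDG_CURRENT_DESKTOP` is consulted first, then `DESKTOP_SESSION`, then legacy markers such as `KDE_FULL_SESSION`. A condensed sketch of just the KDE branches, with strings standing in for the enum members:

    def detect_kde(env):
        # sketch only; mirrors the precedence the assertions below exercise
        if env.get('XDG_CURRENT_DESKTOP') == 'KDE':
            return {'5': 'KDE5', '6': 'KDE6'}.get(env.get('KDE_SESSION_VERSION'), 'KDE4')
        if env.get('DESKTOP_SESSION') == 'kde4':
            return 'KDE4'
        if env.get('DESKTOP_SESSION') == 'kde' or 'KDE_FULL_SESSION' in env:
            return 'KDE4' if 'KDE_SESSION_VERSION' in env else 'KDE3'
        return None

    assert detect_kde({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '6'}) == 'KDE6'
    assert detect_kde({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}) == 'KDE4'
    assert detect_kde({'KDE_FULL_SESSION': 1}) == 'KDE3'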
""" test_cases = [ ({}, _LinuxDesktopEnvironment.OTHER), + ({'DESKTOP_SESSION': 'my_custom_de'}, _LinuxDesktopEnvironment.OTHER), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de'}, _LinuxDesktopEnvironment.OTHER), ({'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), ({'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), - ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE), - ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE), + ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), + ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), - ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE), + ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE3), + ({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), ({'XDG_CURRENT_DESKTOP': 'X-Cinnamon'}, _LinuxDesktopEnvironment.CINNAMON), + ({'XDG_CURRENT_DESKTOP': 'Deepin'}, _LinuxDesktopEnvironment.DEEPIN), ({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), ({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME), - ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE), - ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE), + ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE5), + ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '6'}, _LinuxDesktopEnvironment.KDE6), + ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE4), ({'XDG_CURRENT_DESKTOP': 'Pantheon'}, _LinuxDesktopEnvironment.PANTHEON), + ({'XDG_CURRENT_DESKTOP': 'UKUI'}, _LinuxDesktopEnvironment.UKUI), ({'XDG_CURRENT_DESKTOP': 'Unity'}, _LinuxDesktopEnvironment.UNITY), ({'XDG_CURRENT_DESKTOP': 'Unity:Unity7'}, _LinuxDesktopEnvironment.UNITY), ({'XDG_CURRENT_DESKTOP': 'Unity:Unity8'}, _LinuxDesktopEnvironment.UNITY), ] for env, expected_desktop_environment in test_cases: - self.assertEqual(_get_linux_desktop_environment(env), expected_desktop_environment) + self.assertEqual(_get_linux_desktop_environment(env, Logger()), expected_desktop_environment) def test_chrome_cookie_decryptor_linux_derive_key(self): key = LinuxChromeCookieDecryptor.derive_key(b'abc') diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index ee2af0f704..e46d193416 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -353,7 +353,9 @@ class ChromeCookieDecryptor: Linux: - cookies are either v10 or v11 - v10: AES-CBC encrypted with a fixed key + - also attempts empty password if decryption fails - v11: AES-CBC encrypted with an OS protected key (keyring) + - also attempts empty password if decryption fails - v11 keys can be stored in various places depending on the activate desktop environment [2] Mac: @@ -368,7 +370,7 @@ class ChromeCookieDecryptor: Sources: - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/ - - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_linux.cc + - [2] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_linux.cc - KeyStorageLinux::CreateService """ @@ -390,6 +392,7 @@ class 
LinuxChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') + self._empty_key = self.derive_key(b'') self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} self._browser_keyring_name = browser_keyring_name self._keyring = keyring @@ -402,25 +405,36 @@ def _v11_key(self): @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) def decrypt(self, encrypted_value): + """ + + following the same approach as the fix in [1]: if cookies fail to decrypt then attempt to decrypt + with an empty password. The failure detection is not the same as what chromium uses so the + results won't be perfect + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/ + - a bugfix to try an empty password as a fallback + """ version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': self._cookie_counts['v10'] += 1 - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger) elif version == b'v11': self._cookie_counts['v11'] += 1 if self._v11_key is None: self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger) else: + self._logger.warning(f'unknown cookie version: "{version}"', only_once=True) self._cookie_counts['other'] += 1 return None @@ -435,7 +449,7 @@ def __init__(self, browser_keyring_name, logger): @staticmethod def derive_key(password): # values from - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) def decrypt(self, encrypted_value): @@ -448,12 +462,12 @@ def decrypt(self, encrypted_value): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) + return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger) else: self._cookie_counts['other'] += 1 # other prefixes are considered 'old data' which were stored as plaintext - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_mac.mm return encrypted_value @@ -473,7 +487,7 @@ def decrypt(self, encrypted_value): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc # kNonceLength nonce_length = 96 // 8 # boringssl @@ -490,7 +504,7 @@ def decrypt(self, encrypted_value): else: self._cookie_counts['other'] += 1 # any other 
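The `_empty_key` fallback above exists because some Chromium builds ended up encrypting with a key derived from the empty password (see the chromium commits referenced in the comments further down); since v10 always uses the fixed password `'peanuts'`, both candidates can be derived cheaply up front with only the standard library:

    from hashlib import pbkdf2_hmac

    def derive_chromium_linux_key(password: bytes) -> bytes:
        # constants from os_crypt_linux.cc: salt 'saltysalt', 1 iteration, 16-byte key
        return pbkdf2_hmac('sha1', password, b'saltysalt', 1, 16)

    # the two keys _decrypt_aes_cbc_multi() tries in order
    CANDIDATE_KEYS = (derive_chromium_linux_key(b'peanuts'),
                      derive_chromium_linux_key(b''))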
prefix means the data is DPAPI encrypted - # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc + # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() @@ -673,27 +687,35 @@ class _LinuxDesktopEnvironment(Enum): """ OTHER = auto() CINNAMON = auto() + DEEPIN = auto() GNOME = auto() - KDE = auto() + KDE3 = auto() + KDE4 = auto() + KDE5 = auto() + KDE6 = auto() PANTHEON = auto() + UKUI = auto() UNITY = auto() XFCE = auto() + LXQT = auto() class _LinuxKeyring(Enum): """ - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h SelectedLinuxBackend """ - KWALLET = auto() - GNOMEKEYRING = auto() - BASICTEXT = auto() + KWALLET4 = auto() # this value is just called KWALLET in the chromium source but it is for KDE4 only + KWALLET5 = auto() + KWALLET6 = auto() + GNOME_KEYRING = auto() + BASIC_TEXT = auto() SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() -def _get_linux_desktop_environment(env): +def _get_linux_desktop_environment(env, logger): """ https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc GetDesktopEnvironment @@ -708,51 +730,97 @@ def _get_linux_desktop_environment(env): return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'Deepin': + return _LinuxDesktopEnvironment.DEEPIN elif xdg_current_desktop == 'GNOME': return _LinuxDesktopEnvironment.GNOME elif xdg_current_desktop == 'X-Cinnamon': return _LinuxDesktopEnvironment.CINNAMON elif xdg_current_desktop == 'KDE': - return _LinuxDesktopEnvironment.KDE + kde_version = env.get('KDE_SESSION_VERSION', None) + if kde_version == '5': + return _LinuxDesktopEnvironment.KDE5 + elif kde_version == '6': + return _LinuxDesktopEnvironment.KDE6 + elif kde_version == '4': + return _LinuxDesktopEnvironment.KDE4 + else: + logger.info(f'unknown KDE version: "{kde_version}". 
Assuming KDE4') + return _LinuxDesktopEnvironment.KDE4 elif xdg_current_desktop == 'Pantheon': return _LinuxDesktopEnvironment.PANTHEON elif xdg_current_desktop == 'XFCE': return _LinuxDesktopEnvironment.XFCE + elif xdg_current_desktop == 'UKUI': + return _LinuxDesktopEnvironment.UKUI + elif xdg_current_desktop == 'LXQt': + return _LinuxDesktopEnvironment.LXQT + else: + logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + elif desktop_session is not None: - if desktop_session in ('mate', 'gnome'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): return _LinuxDesktopEnvironment.GNOME - elif 'kde' in desktop_session: - return _LinuxDesktopEnvironment.KDE - elif 'xfce' in desktop_session: + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI + else: + logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + else: if 'GNOME_DESKTOP_SESSION_ID' in env: return _LinuxDesktopEnvironment.GNOME elif 'KDE_FULL_SESSION' in env: - return _LinuxDesktopEnvironment.KDE + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 return _LinuxDesktopEnvironment.OTHER def _choose_linux_keyring(logger): """ - https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc - SelectBackend + SelectBackend in [1] + + There is currently support for forcing chromium to use BASIC_TEXT by creating a file called + `Disable Local Encryption` [1] in the user data dir. The function to write this file (`WriteBackendUse()` [1]) + does not appear to be called anywhere other than in tests, so the user would have to create this file manually + and so would be aware enough to tell yt-dlp to use the BASIC_TEXT keyring. + + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.cc """ - desktop_environment = _get_linux_desktop_environment(os.environ) + desktop_environment = _get_linux_desktop_environment(os.environ, logger) logger.debug(f'detected desktop environment: {desktop_environment.name}') - if desktop_environment == _LinuxDesktopEnvironment.KDE: - linux_keyring = _LinuxKeyring.KWALLET - elif desktop_environment == _LinuxDesktopEnvironment.OTHER: - linux_keyring = _LinuxKeyring.BASICTEXT + if desktop_environment == _LinuxDesktopEnvironment.KDE4: + linux_keyring = _LinuxKeyring.KWALLET4 + elif desktop_environment == _LinuxDesktopEnvironment.KDE5: + linux_keyring = _LinuxKeyring.KWALLET5 + elif desktop_environment == _LinuxDesktopEnvironment.KDE6: + linux_keyring = _LinuxKeyring.KWALLET6 + elif desktop_environment in ( + _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER + ): + linux_keyring = _LinuxKeyring.BASIC_TEXT else: - linux_keyring = _LinuxKeyring.GNOMEKEYRING + linux_keyring = _LinuxKeyring.GNOME_KEYRING return linux_keyring -def _get_kwallet_network_wallet(logger): +def _get_kwallet_network_wallet(keyring, logger): """ The name of the wallet used to store network passwords. 
- https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/kwallet_dbus.cc KWalletDBus::NetworkWallet which does a dbus call to the following function: https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html @@ -760,10 +828,22 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: + if keyring == _LinuxKeyring.KWALLET4: + service_name = 'org.kde.kwalletd' + wallet_path = '/modules/kwalletd' + elif keyring == _LinuxKeyring.KWALLET5: + service_name = 'org.kde.kwalletd5' + wallet_path = '/modules/kwalletd5' + elif keyring == _LinuxKeyring.KWALLET6: + service_name = 'org.kde.kwalletd6' + wallet_path = '/modules/kwalletd6' + else: + raise ValueError(keyring) + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', - '--dest=org.kde.kwalletd5', - '/modules/kwalletd5', + f'--dest={service_name}', + wallet_path, 'org.kde.KWallet.networkWallet' ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) @@ -778,8 +858,8 @@ def _get_kwallet_network_wallet(logger): return default_wallet -def _get_kwallet_password(browser_keyring_name, logger): - logger.debug('using kwallet-query to obtain password from kwallet') +def _get_kwallet_password(browser_keyring_name, keyring, logger): + logger.debug(f'using kwallet-query to obtain password from {keyring.name}') if shutil.which('kwallet-query') is None: logger.error('kwallet-query command not found. KWallet and kwallet-query ' @@ -787,7 +867,7 @@ def _get_kwallet_password(browser_keyring_name, logger): 'included in the kwallet package for your distribution') return b'' - network_wallet = _get_kwallet_network_wallet(logger) + network_wallet = _get_kwallet_network_wallet(keyring, logger) try: stdout, _, returncode = Popen.run([ @@ -809,8 +889,9 @@ def _get_kwallet_password(browser_keyring_name, logger): # checks hasEntry. To verify this: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. 
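For reference, the wallet lookup above amounts to a single session-bus call; a standalone equivalent (defaults mirror the KWALLET5 branch, and `'kdewallet'` is the documented fallback when the call fails):

    import subprocess

    def kwallet_network_wallet(service='org.kde.kwalletd5',
                               wallet_path='/modules/kwalletd5'):
        # same dbus-send invocation as _get_kwallet_network_wallet() above
        try:
            proc = subprocess.run(
                ['dbus-send', '--session', '--print-reply=literal',
                 f'--dest={service}', wallet_path,
                 'org.kde.KWallet.networkWallet'],
                capture_output=True, text=True, timeout=10)
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return 'kdewallet'
        return proc.stdout.strip() or 'kdewallet'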
+ # this was identified as a bug later and fixed in + # https://chromium.googlesource.com/chromium/src/+/bbd54702284caca1f92d656fdcadf2ccca6f4165%5E%21/#F0 + # https://chromium.googlesource.com/chromium/src/+/5463af3c39d7f5b6d11db7fbd51e38cc1974d764 return b'' else: logger.debug('password found') @@ -848,11 +929,11 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) logger.debug(f'Chosen keyring: {keyring.name}') - if keyring == _LinuxKeyring.KWALLET: - return _get_kwallet_password(browser_keyring_name, logger) - elif keyring == _LinuxKeyring.GNOMEKEYRING: + if keyring in (_LinuxKeyring.KWALLET4, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): + return _get_kwallet_password(browser_keyring_name, keyring, logger) + elif keyring == _LinuxKeyring.GNOME_KEYRING: return _get_gnome_keyring_password(browser_keyring_name, logger) - elif keyring == _LinuxKeyring.BASICTEXT: + elif keyring == _LinuxKeyring.BASIC_TEXT: # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required) return None assert False, f'Unknown keyring {keyring}' @@ -877,6 +958,10 @@ def _get_mac_keyring_password(browser_keyring_name, logger): def _get_windows_v10_key(browser_root, logger): + """ + References: + - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc + """ path = _find_most_recently_used_file(browser_root, 'Local State', logger) if path is None: logger.error('could not find local state file') @@ -885,11 +970,13 @@ def _get_windows_v10_key(browser_root, logger): with open(path, encoding='utf8') as f: data = json.load(f) try: + # kOsCryptEncryptedKeyPrefName in [1] base64_key = data['os_crypt']['encrypted_key'] except KeyError: logger.error('no encrypted key in Local State') return None encrypted_key = base64.b64decode(base64_key) + # kDPAPIKeyPrefix in [1] prefix = b'DPAPI' if not encrypted_key.startswith(prefix): logger.error('invalid key') @@ -901,13 +988,15 @@ def pbkdf2_sha1(password, salt, iterations, key_length): return pbkdf2_hmac('sha1', password, salt, iterations, key_length) -def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): - plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) - try: - return plaintext.decode() - except UnicodeDecodeError: - logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) - return None +def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): + for key in keys: + plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) + try: + return plaintext.decode() + except UnicodeDecodeError: + pass + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. 
Possibly the key is wrong?', only_once=True) + return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): From f78eb41e1c0f1dcdb10317358a26bf541dc7ee15 Mon Sep 17 00:00:00 2001 From: Stefan Lobbenmeier <Stefan.Lobbenmeier@gmail.com> Date: Mon, 29 May 2023 16:28:14 +0200 Subject: [PATCH 136/501] [extractor/ARDBetaMediathek] Add thumbnail (#6890) Closes #6889 Authored by: StefanLobbenmeier --- yt_dlp/extractor/ard.py | 44 +++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 8660741ce4..ca1faa7d0b 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -13,6 +13,7 @@ try_get, unified_strdate, unified_timestamp, + update_url, update_url_query, url_or_none, xpath_text, @@ -408,6 +409,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' _TESTS = [{ + 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI', + 'md5': '3fd5fead7a370a819341129c8d713136', + 'info_dict': { + 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen', + 'id': '12172961', + 'title': 'Wolfsland - Die traurigen Schwestern', + 'description': r're:^Als der Polizeiobermeister Raaben', + 'duration': 5241, + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957', + 'timestamp': 1670710500, + 'upload_date': '20221210', + 'ext': 'mp4', + 'age_limit': 12, + 'episode': 'Wolfsland - Die traurigen Schwestern', + 'series': 'Filme im MDR' + }, + }, { 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { @@ -424,7 +442,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'skip': 'Error', }, { 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', - 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'md5': '1e73ded21cb79bac065117e80c81dc88', 'info_dict': { 'id': '10049223', 'ext': 'mp4', @@ -432,13 +450,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'timestamp': 1636398000, 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', 'upload_date': '20211108', - }, - }, { - 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', - 'playlist_count': 6, - 'info_dict': { - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', - 'title': 'beforeigners/beforeigners/staffel-1', + 'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste', + 'duration': 915, + 'episode': 'tagesschau, 20:00 Uhr', + 'series': 'tagesschau', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49', }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', @@ -602,6 +618,9 @@ def _real_extract(self, url): show { title } + image { + src + } synopsis title tracking { @@ -640,6 +659,15 @@ def _real_extract(self, url): 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), 'series': try_get(player_page, lambda x: x['show']['title']), + 'thumbnail': (media_collection.get('_previewImage') + or 
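The ARD thumbnail fallback here prefers the API image but strips its resizing parameters first; `update_url(..., query=None, fragment=None)` simply drops those URL components (the query string below is a made-up example):

    from yt_dlp.utils import update_url

    src = 'https://api.ardmediathek.de/image-service/images/urn:ard:image:abc?w=960#crop'
    assert update_url(src, query=None, fragment=None) == \
        'https://api.ardmediathek.de/image-service/images/urn:ard:image:abc'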
try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None)) + or self.get_thumbnail_from_html(display_id, url)), }) info.update(self._ARD_extract_episode_info(info['title'])) return info + + def get_thumbnail_from_html(self, display_id, url): + webpage = self._download_webpage(url, display_id, fatal=False) or '' + return ( + self._og_search_thumbnail(webpage, default=None) + or self._html_search_meta('thumbnailUrl', webpage, default=None)) From dbce5afa6bb61f6272ade613f2e9a3d66b88c7ea Mon Sep 17 00:00:00 2001 From: Florian Albrechtskirchinger <falbrechtskirchinger@gmail.com> Date: Mon, 29 May 2023 16:30:20 +0200 Subject: [PATCH 137/501] [extractor/twitch:vod] Support links from schedule tab (#7071) Authored by: falbrechtskirchinger --- yt_dlp/extractor/twitch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index d7a1cc531a..4a17d80489 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -194,7 +194,8 @@ class TwitchVodIE(TwitchBaseIE): https?:// (?: (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/| - player\.twitch\.tv/\?.*?\bvideo=v? + player\.twitch\.tv/\?.*?\bvideo=v?| + www\.twitch\.tv/[^/]+/schedule\?vodID= ) (?P<id>\d+) ''' @@ -363,6 +364,9 @@ class TwitchVodIE(TwitchBaseIE): 'skip_download': True }, 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 403: Forbidden'] + }, { + 'url': 'https://www.twitch.tv/tangotek/schedule?vodID=1822395420', + 'only_matching': True, }] def _download_info(self, item_id): From 45e87ea106ad37b2a002663fa30ee41ce97b16cd Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Mon, 29 May 2023 23:31:22 +0900 Subject: [PATCH 138/501] [extractor/eurosport] Improve `_VALID_URL` (#7076) Closes #7042 Authored by: HobbyistDev --- yt_dlp/extractor/eurosport.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/eurosport.py b/yt_dlp/extractor/eurosport.py index 654e112064..6c426bb89c 100644 --- a/yt_dlp/extractor/eurosport.py +++ b/yt_dlp/extractor/eurosport.py @@ -3,7 +3,7 @@ class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -44,6 +44,32 @@ class EurosportIE(InfoExtractor): 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', 'upload_date': '20220727', } + }, { + 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml', + 'info_dict': { + 'id': '3096477', + 'ext': 'mp4', + 'title': 'md5:82edc17370124c7a19b3cf518517583b', + 'duration': 84.0, + 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg', + 'timestamp': 1681292028, + 'upload_date': '20230412', + 'display_id': 'vid1896254', + } + }, { + 'url': 
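The relaxed Eurosport pattern above makes the tournament/season path segments optional, which is what lets the short-form URLs from #7042 match; checking both shapes (slugs shortened for readability):

    import re

    EUROSPORT_RE = re.compile(
        r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)')

    # with tournament/season segments
    assert EUROSPORT_RE.match(
        'https://www.eurosport.com/football/champions-league/2022-2023/'
        'pep-guardiola_vid1896254/video.shtml').group('id') == 'vid1896254'
    # and without them (the newly supported case)
    assert EUROSPORT_RE.match(
        'https://www.eurosport.com/football/'
        'last-year-s-semi-final_vid1914115/video.shtml').group('id') == 'vid1914115'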
'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml', + 'info_dict': { + 'id': '3149108', + 'ext': 'mp4', + 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final', + 'description': 'md5:89ef142fe0170a66abab77fac2955d8e', + 'display_id': 'vid1914115', + 'timestamp': 1684403618, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg', + 'duration': 105.0, + 'upload_date': '20230518', + } }] _TOKEN = None From 4cbfa570a1b9bd65b0f48770693377e8d842dcb0 Mon Sep 17 00:00:00 2001 From: garret <76261416+garret1317@users.noreply.github.com> Date: Mon, 29 May 2023 15:44:26 +0100 Subject: [PATCH 139/501] [extractor/camfm] Add extractors (#7083) Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/camfm.py | 85 +++++++++++++++++++++++++++++++++ yt_dlp/utils/_utils.py | 1 + 3 files changed, 90 insertions(+) create mode 100644 yt_dlp/extractor/camfm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f9fa84c43b..73dcf4e0ee 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -284,6 +284,10 @@ CamdemyIE, CamdemyFolderIE ) +from .camfm import ( + CamFMEpisodeIE, + CamFMShowIE +) from .cammodels import CamModelsIE from .camsoda import CamsodaIE from .camtasia import CamtasiaEmbedIE diff --git a/yt_dlp/extractor/camfm.py b/yt_dlp/extractor/camfm.py new file mode 100644 index 0000000000..a9850f46e0 --- /dev/null +++ b/yt_dlp/extractor/camfm.py @@ -0,0 +1,85 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + get_elements_by_class, + join_nonempty, + traverse_obj, + unified_timestamp, + urljoin, +) + + +class CamFMShowIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/shows/(?P<id>[^/]+)' + _TESTS = [{ + 'playlist_mincount': 5, + 'url': 'https://camfm.co.uk/shows/soul-mining/', + 'info_dict': { + 'id': 'soul-mining', + 'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a', + 'title': 'Soul Mining', + 'description': 'Telling the stories of jazz, funk and soul from all corners of the world.', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + page = self._download_webpage(url, show_id) + + return { + '_type': 'playlist', + 'id': show_id, + 'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE) + for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)], + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r'<img[^>]+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)), + 'title': self._html_search_regex('<h1>([^<]+)</h1>', page, 'title', fatal=False), + 'description': clean_html(get_element_by_class('small-12 medium-8 cell', page)) + } + + +class CamFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/player/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://camfm.co.uk/player/43336', + 'skip': 'Episode will expire - don\'t actually know when, but it will go eventually', + 'info_dict': { + 'id': '43336', + 'title': 'AITAA: Am I the Agony Aunt? 
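CamFMEpisodeIE turns the scraped 'Aired at' string into both the title suffix and the timestamp; the `'%H:%M %d/%m/%Y'` entry added to `DATE_FORMATS` below is what makes `unified_timestamp` accept it (weekday names are stripped before matching, and parsing is in UTC, hence the daylight-savings caveat in the code):

    from yt_dlp.utils import join_nonempty, unified_timestamp

    aired = '19:00 Tue 16/05/2023'
    assert unified_timestamp(aired) == 1684263600  # matches the test data here
    assert join_nonempty('AITAA: Am I the Agony Aunt?', aired, delim=' - ') \
        == 'AITAA: Am I the Agony Aunt? - 19:00 Tue 16/05/2023'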
- 19:00 Tue 16/05/2023', + 'ext': 'mp3', + 'upload_date': '20230516', + 'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf', + 'timestamp': 1684263600, + 'series': 'AITAA: Am I the Agony Aunt?', + 'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1', + 'categories': ['Entertainment'], + } + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + page = self._download_webpage(url, episode_id) + audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id) + + caption = get_element_by_class('caption', page) + series = clean_html(re.sub(r'<span[^<]+<[^<]+>', '', caption)) + + card_section = get_element_by_class('card-section', page) + date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False) + + return { + 'id': episode_id, + 'title': join_nonempty(series, date, delim=' - '), + 'formats': traverse_obj(audios, (..., 'formats', ...)), + 'timestamp': unified_timestamp(date), # XXX: Does not account for UK's daylight savings + 'series': series, + 'description': clean_html(re.sub(r'<b>[^<]+</b><br[^>]+/>', '', card_section)), + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r'<div[^>]+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)', + page, 'thumbnail', fatal=False)), + 'categories': get_elements_by_class('label', caption), + 'was_live': True, + } diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 7c91faff86..4179d58c16 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -223,6 +223,7 @@ def IDENTITY(x): '%d/%m/%y', '%d/%m/%Y %H:%M:%S', '%d-%m-%Y %H:%M', + '%H:%M %d/%m/%Y', ]) DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) From 372a0f3b9dadd1e52234b498aa4c7040ef868c7d Mon Sep 17 00:00:00 2001 From: Ivan Skodje <ivanskodje@users.noreply.github.com> Date: Mon, 29 May 2023 16:50:21 +0200 Subject: [PATCH 140/501] Auto-select default format in `-f-` (#7101) Authored by: ivanskodje, pukkandan Closes #6720 --- yt_dlp/YoutubeDL.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f49dbf07da..b4923920fc 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2744,21 +2744,22 @@ def is_wellformed(f): return info_dict format_selector = self.format_selector - if format_selector is None: - req_format = self._default_format_spec(info_dict, download=download) - self.write_debug('Default format spec: %s' % req_format) - format_selector = self.build_format_selector(req_format) - while True: if interactive_format_selection: - req_format = input( - self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS)) + req_format = input(self._format_screen('\nEnter format selector ', self.Styles.EMPHASIS) + + '(Press ENTER for default, or Ctrl+C to quit)' + + self._format_screen(': ', self.Styles.EMPHASIS)) try: - format_selector = self.build_format_selector(req_format) + format_selector = self.build_format_selector(req_format) if req_format else None except SyntaxError as err: self.report_error(err, tb=False, is_error=False) continue + if format_selector is None: + req_format = self._default_format_spec(info_dict, download=download) + self.write_debug(f'Default format spec: {req_format}') + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector({ 'formats': formats, 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), From 937264419f9bf375d5656785ae6e53282587c15d Mon Sep 17 00:00:00 2001 From: Ivan Skodje 
<ivanskodje@users.noreply.github.com> Date: Mon, 29 May 2023 16:53:35 +0200 Subject: [PATCH 141/501] [extractor/tvplay] Remove outdated domains (#7106) Closes #3920 Authored by: ivanskodje --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/tvplay.py | 213 +------------------------------- 2 files changed, 1 insertion(+), 213 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 73dcf4e0ee..2963998cb6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2098,7 +2098,6 @@ ) from .tvplay import ( TVPlayIE, - ViafreeIE, TVPlayHomeIE, ) from .tvplayer import TVPlayerIE diff --git a/yt_dlp/extractor/tvplay.py b/yt_dlp/extractor/tvplay.py index 9ef4f962c5..e056f9872c 100644 --- a/yt_dlp/extractor/tvplay.py +++ b/yt_dlp/extractor/tvplay.py @@ -30,10 +30,7 @@ class TVPlayIE(InfoExtractor): (?: tvplay(?:\.skaties)?\.lv(?:/parraides)?| (?:tv3play|play\.tv3)\.lt(?:/programos)?| - tv3play(?:\.tv3)?\.ee/sisu| - (?:tv(?:3|6|8|10)play)\.se/program| - (?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer| - play\.nova(?:tv)?\.bg/programi + tv3play(?:\.tv3)?\.ee/sisu ) /(?:[^/]+/)+ ) @@ -92,117 +89,6 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, - { - 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', - 'info_dict': { - 'id': '395385', - 'ext': 'mp4', - 'title': 'Husräddarna S02E07', - 'description': 'md5:f210c6c89f42d4fc39faa551be813777', - 'duration': 2574, - 'timestamp': 1400596321, - 'upload_date': '20140520', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', - 'info_dict': { - 'id': '266636', - 'ext': 'mp4', - 'title': 'Den sista dokusåpan S01E08', - 'description': 'md5:295be39c872520221b933830f660b110', - 'duration': 1492, - 'timestamp': 1330522854, - 'upload_date': '20120229', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', - 'info_dict': { - 'id': '282756', - 'ext': 'mp4', - 'title': 'Antikjakten S01E10', - 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', - 'duration': 2646, - 'timestamp': 1348575868, - 'upload_date': '20120925', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', - 'info_dict': { - 'id': '230898', - 'ext': 'mp4', - 'title': 'Anna Anka søker assistent - Ep. 8', - 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', - 'duration': 2656, - 'timestamp': 1277720005, - 'upload_date': '20100628', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', - 'info_dict': { - 'id': '21873', - 'ext': 'mp4', - 'title': 'Budbringerne program 10', - 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', - 'duration': 1297, - 'timestamp': 1254205102, - 'upload_date': '20090929', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', - 'info_dict': { - 'id': '361883', - 'ext': 'mp4', - 'title': 'Hotelinspektør Alex Polizzi - Ep. 
10', - 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', - 'duration': 2594, - 'timestamp': 1393236292, - 'upload_date': '20140224', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true', - 'info_dict': { - 'id': '624952', - 'ext': 'flv', - 'title': 'Здравей, България (12.06.2015 г.) ', - 'description': 'md5:99f3700451ac5bb71a260268b8daefd7', - 'duration': 8838, - 'timestamp': 1434100372, - 'upload_date': '20150612', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - { - 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true', - 'only_matching': True, - }, { 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, @@ -327,103 +213,6 @@ def _real_extract(self, url): } -class ViafreeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - viafree\.(?P<country>dk|no|se|fi) - /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+) - ''' - _TESTS = [{ - 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', - 'info_dict': { - 'id': '757786', - 'ext': 'mp4', - 'title': 'Det beste vorspielet - Sesong 2 - Episode 1', - 'description': 'md5:b632cb848331404ccacd8cd03e83b4c3', - 'series': 'Det beste vorspielet', - 'season_number': 2, - 'duration': 1116, - 'timestamp': 1471200600, - 'upload_date': '20160814', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660', - 'info_dict': { - 'id': '1047660', - 'ext': 'mp4', - 'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen', - 'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d', - 'series': 'Comedy Central Roast of Charlie Sheen', - 'season_number': 1, - 'duration': 3747, - 'timestamp': 1608246060, - 'upload_date': '20201217' - }, - 'params': { - 'skip_download': True - } - }, { - # with relatedClips - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', - 'only_matching': True, - }, { - # Different og:image URL schema - 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', - 'only_matching': True, - }, { - 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', - 'only_matching': True, - }, { - 'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - country, path = self._match_valid_url(url).groups() - content = self._download_json( - 'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path) - program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program'] - guid = program['guid'] - meta = content['meta'] - title = meta['title'] - - try: - stream_href = self._download_json( - program['_links']['streamLink']['href'], guid, - headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - 
self.raise_geo_restricted(countries=[country]) - raise - - formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4') - episode = program.get('episode') or {} - return { - 'id': guid, - 'title': title, - 'thumbnail': meta.get('image'), - 'description': meta.get('description'), - 'series': episode.get('seriesTitle'), - 'subtitles': subtitles, - 'episode_number': int_or_none(episode.get('episodeNumber')), - 'season_number': int_or_none(episode.get('seasonNumber')), - 'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000), - 'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])), - 'formats': formats, - } - - class TVPlayHomeIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// From dc3c44f349ba85af320e706e2a27ad81a78b1c6e Mon Sep 17 00:00:00 2001 From: Mohamed Al Mehairbi <62325490+ItzMaxTV@users.noreply.github.com> Date: Mon, 29 May 2023 19:19:13 +0400 Subject: [PATCH 142/501] [extractor/Mzaalo] Add extractor (#7163) Authored by: ItzMaxTV --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mzaalo.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 yt_dlp/extractor/mzaalo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2963998cb6..5b4ed44ef4 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1162,6 +1162,7 @@ ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE +from .mzaalo import MzaaloIE from .n1 import ( N1InfoAssetIE, N1InfoIIE, diff --git a/yt_dlp/extractor/mzaalo.py b/yt_dlp/extractor/mzaalo.py new file mode 100644 index 0000000000..c6f420ceaa --- /dev/null +++ b/yt_dlp/extractor/mzaalo.py @@ -0,0 +1,92 @@ +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_duration, + traverse_obj, + url_or_none, +) + + +class MzaaloIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mzaalo\.com/play/(?P<type>movie|original|clip)/(?P<id>[a-fA-F0-9-]+)/[\w-]+' + _TESTS = [{ + # Movies + 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun', + 'info_dict': { + 'id': 'c0958d9f-f90e-4503-a755-44358758921d', + 'title': 'Jamun', + 'ext': 'mp4', + 'description': 'md5:24fe9ebb9bbe5b36f7b54b90ab1e2f31', + 'thumbnails': 'count:15', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5527.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Shows + 'url': 'https://www.mzaalo.com/play/original/93d42b2b-f373-4c2d-bca4-997412cb069d/Modi-Season-2-CM-TO-PM/Episode-1:Decision,-Not-Promises', + 'info_dict': { + 'id': '93d42b2b-f373-4c2d-bca4-997412cb069d', + 'title': 'Episode 1:Decision, Not Promises', + 'ext': 'mp4', + 'description': 'md5:16f76058432a54774fbb2561a1955652', + 'thumbnails': 'count:22', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2040.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Streams/Clips + 'url': 'https://www.mzaalo.com/play/clip/83cdbcb5-400a-42f1-a1d2-459053cfbda5/Manto-Ki-Kahaaniya', + 'info_dict': { + 'id': '83cdbcb5-400a-42f1-a1d2-459053cfbda5', + 'title': 'Manto Ki Kahaaniya', + 'ext': 'mp4', + 'description': 'md5:c3c5f1d05f0fd1bfcb05b673d1cc9f2f', + 'thumbnails': 'count:3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1937.0, + 'language': 'hin', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def 
_real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + path = (f'partner/streamurl?&assetId={video_id}&getClipDetails=YES' if type_ == 'clip' + else f'api/v2/player/details?assetType={type_.upper()}&assetId={video_id}') + data = self._download_json( + f'https://production.mzaalo.com/platform/{path}', video_id, headers={ + 'Ocp-Apim-Subscription-Key': '1d0caac2702049b89a305929fdf4cbae', + })['data'] + + formats = self._extract_m3u8_formats(data['streamURL'], video_id) + + subtitles = {} + for subs_lang, subs_url in traverse_obj(data, ('subtitles', {dict.items}, ...)): + if url_or_none(subs_url): + subtitles[subs_lang] = [{'url': subs_url, 'ext': 'vtt'}] + + lang = traverse_obj(data, ('language', {str.lower})) + for f in formats: + f['language'] = lang + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {parse_duration}), + 'age_limit': ('maturity_rating', {parse_age_limit}), + 'thumbnails': ('images', ..., {'url': {url_or_none}}), + 'categories': ('genre', ..., {str}), + }), + } From 03789976d301eaed3e957dbc041573098f6af059 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Tue, 30 May 2023 00:20:07 +0900 Subject: [PATCH 143/501] [extractor/europarl] Rewrite extractor (#7114) Authored by: HobbyistDev Closes #6396 --- yt_dlp/extractor/europa.py | 82 +++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 29daabe4a3..f3da95f5c9 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -6,6 +6,7 @@ parse_iso8601, parse_qs, qualities, + traverse_obj, unified_strdate, xpath_text ) @@ -92,42 +93,17 @@ def get_item(type_, preference): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:multimedia|webstreaming)\.europarl\.europa\.eu/[^/#?]+/ - (?:embed/embed\.html\?event=|(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+) + https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ + (?:(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', - 'ext': 'mp4', - 'release_timestamp': 1663137900, - 'title': 'Plenary session', - 'release_date': '20220914', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/eu-cop27-un-climate-change-conference-in-sharm-el-sheikh-egypt-ep-delegation-meets-with-ngo-represen_20221114-1600-SPECIAL-OTHER', - 'info_dict': { - 'id': 'a8428de8-b9cd-6a2e-11e4-3805d9c9ff5c', - 'ext': 'mp4', - 'release_timestamp': 1668434400, - 'release_date': '20221114', - 'title': 'md5:d3550280c33cc70e0678652e3d52c028', - }, - 'params': { - 'skip_download': True, - } - }, { - # embed webpage - 'url': 'https://webstreaming.europarl.europa.eu/ep/embed/embed.html?event=20220914-0900-PLENARY&language=en&autoplay=true&logo=true', - 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', 'ext': 'mp4', 'title': 'Plenary session', + 'release_timestamp': 1663139069, 'release_date': '20220914', - 'release_timestamp': 1663137900, }, 'params': { 'skip_download': True, @@ -144,30 +120,54 @@ class EuroParlWebstreamIE(InfoExtractor): 'live_status': 'is_live', }, 
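# --- Editor's aside (not part of the patch): the traverse_obj idioms used by
# the new Mzaalo extractor above, demonstrated on a toy payload. The payload
# values are hypothetical; the helper is yt_dlp.utils.traverse_obj.
from yt_dlp.utils import traverse_obj, url_or_none

data = {
    'title': 'Example',
    'language': 'HIN',
    'subtitles': {'en': 'https://example.com/en.vtt', 'de': 'https://example.com/de.vtt'},
}

# A one-item set holds a transformation: {dict.items} turns the mapping into
# (key, value) pairs, which `...` then branches over:
print(traverse_obj(data, ('subtitles', {dict.items}, ...)))
# [('en', 'https://example.com/en.vtt'), ('de', 'https://example.com/de.vtt')]

# A dict template builds a new mapping and silently drops keys whose branch
# resolves to None, e.g. the missing thumbnail here:
print(traverse_obj(data, {
    'title': ('title', {str}),
    'language': ('language', {str.lower}),
    'thumbnail': ('thumbnail', {url_or_none}),
}))
# {'title': 'Example', 'language': 'hin'}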
'skip': 'not live anymore' + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'info_dict': { + 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', + 'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, + } + }, { + # live stream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'info_dict': { + 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', + 'release_timestamp': 1684911541, + 'live_status': 'is_live', + }, + 'skip': 'Not live anymore' }] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] json_info = self._download_json( - 'https://vis-api.vuplay.co.uk/event/external', display_id, + 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, query={ - 'player_key': 'europarl|718f822c-a48c-4841-9947-c9cb9bb1743c', - 'external_id': display_id, + 'api-version': 1.0, + 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', + 'externalReference': display_id }) - formats, subtitles = self._extract_mpd_formats_and_subtitles(json_info['streaming_url'], display_id) - fmts, subs = self._extract_m3u8_formats_and_subtitles( - json_info['streaming_url'].replace('.mpd', '.m3u8'), display_id) - - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) + formats, subtitles = [], {} + for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): + fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) return { 'id': json_info['id'], - 'title': json_info.get('title'), + 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'formats': formats, 'subtitles': subtitles, - 'release_timestamp': parse_iso8601(json_info.get('published_start')), - 'is_live': 'LIVE' in json_info.get('state', '') + 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), + 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live' } From 6f10cdcf7eeaeae5b75e0a4428cd649c156a2d83 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 29 May 2023 09:30:30 -0600 Subject: [PATCH 144/501] [extractor/bilibili:SpaceVideo] Extract signature (#7149) Authored by: elyse0 Closes #6956, closes #7081 --- yt_dlp/extractor/bilibili.py | 57 ++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index faa2218ced..6629fbc08c 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1,7 +1,9 @@ import base64 import functools +import hashlib import itertools import math +import time import urllib.error import urllib.parse @@ -26,6 +28,7 @@ srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, unified_timestamp, unsmuggle_url, url_or_none, @@ -514,19 +517,63 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 
'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, }] + def _extract_signature(self, playlist_id): + session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False) + + key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0] + img_key = traverse_obj( + session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100' + sub_key = traverse_obj( + session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6' + + session_key = img_key + sub_key + + signature_values = [] + for position in ( + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, + 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, + 57, 62, 11, 36, 20, 34, 44, 52 + ): + char_at_position = try_call(lambda: session_key[position]) + if char_at_position: + signature_values.append(char_at_position) + + return ''.join(signature_values)[:32] + def _real_extract(self, url): playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') if not is_video_url: self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' 'To download audios, add a "/audio" to the URL') + signature = self._extract_signature(playlist_id) + def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + 'wts': int(time.time()), + } + query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest() + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', + playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: raise ExtractorError( @@ -556,9 +603,9 @@ def get_entries(page_data): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] From 26c517b29c8727e47948d6fff749d5297f0efb60 Mon Sep 17 00:00:00 2001 From: Mohamed Al Mehairbi <62325490+ItzMaxTV@users.noreply.github.com> Date: Tue, 30 May 2023 17:40:56 +0400 Subject: [PATCH 145/501] [extractor/crtvg] Add extractor (#7168) Closes #6609 Authored by: ItzMaxTV --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/crtvg.py | 34 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 yt_dlp/extractor/crtvg.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 5b4ed44ef4..85c584f5e1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -411,6 +411,7 @@ CrowdBunkerIE, CrowdBunkerChannelIE, ) +from .crtvg import CrtvgIE from .crunchyroll import ( CrunchyrollBetaIE, CrunchyrollBetaShowIE, diff --git a/yt_dlp/extractor/crtvg.py b/yt_dlp/extractor/crtvg.py new file mode 100644 index 0000000000..1aa8d77055 --- /dev/null +++ b/yt_dlp/extractor/crtvg.py @@ -0,0 
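# --- Editor's aside (not part of the patch): the WBI signing scheme that
# _extract_signature() and fetch_page() above implement, as a standalone
# sketch. The reorder table and the md5(urlencode(query) + mixin_key)
# construction come straight from the diff; the query below is a trimmed,
# hypothetical example. Note that the extractor builds its request keys in
# alphabetical order, which this sketch makes explicit by sorting.
import hashlib
import time
import urllib.parse

REORDER = (
    46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
    33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
    61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
    36, 20, 34, 44, 52)


def mixin_key(img_key, sub_key):
    # Reorder the concatenated keys with the fixed table, keep 32 chars
    session_key = img_key + sub_key
    return ''.join(session_key[pos] for pos in REORDER if pos < len(session_key))[:32]


def signed_query(query, img_key, sub_key):
    query = dict(sorted(query.items()), wts=int(time.time()))
    payload = urllib.parse.urlencode(query) + mixin_key(img_key, sub_key)
    return {**query, 'w_rid': hashlib.md5(payload.encode()).hexdigest()}


# Using the fallback keys hard-coded in the diff above:
print(signed_query({'mid': '313580179', 'pn': 1, 'ps': 30},
                   '34478ba821254d9d93542680e3b86100',
                   '7e16a90d190a4355a78fd00b32a38de6'))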
+1,34 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class CrtvgIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/[^/#?]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': '5839623', + 'title': 'Os caimáns do Tea', + 'ext': 'mp4', + 'description': 'md5:f71cfba21ae564f0a6f415b31de1f842', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', webpage, 'video url') + formats = self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False) + formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'title': remove_end(self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'), + 'description': self._html_search_meta('description', webpage, 'description', default=None), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None), + } From 1fe5bf240e6ade487d18079a62aa36bcc440a27a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 May 2023 10:43:01 -0500 Subject: [PATCH 146/501] [extractor/bravotv] Detect DRM (#7171) Authored by: bashonly --- yt_dlp/extractor/bravotv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index d4bf9b53b7..13cc1927f1 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -1,5 +1,6 @@ from .adobepass import AdobePassIE from ..utils import ( + HEADRequest, extract_attributes, float_or_none, get_element_html_by_class, @@ -153,8 +154,11 @@ def _real_extract(self, url): if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): chapters = None - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - update_url_query(f'{tp_url}/stream.m3u8', query), video_id, 'mp4', m3u8_id='hls') + m3u8_url = self._request_webpage(HEADRequest( + update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').geturl() + if 'mpeg_cenc' in m3u8_url: + self.report_drm(video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, From c2502cfed91415c7ccfff925fd3404d230046484 Mon Sep 17 00:00:00 2001 From: mrscrapy <116454146+mrscrapy@users.noreply.github.com> Date: Wed, 31 May 2023 04:41:21 +0100 Subject: [PATCH 147/501] [extractor/recurbate] Add extractor (#6297) Authored by: mrscrapy --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/recurbate.py | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 yt_dlp/extractor/recurbate.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 85c584f5e1..abe3c22889 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1594,6 +1594,7 @@ RCTIPlusTVIE, ) from .rds import RDSIE +from .recurbate import RecurbateIE from .redbee import ParliamentLiveUKIE, RTBFIE from .redbulltv import ( RedBullTVIE, diff --git a/yt_dlp/extractor/recurbate.py b/yt_dlp/extractor/recurbate.py new file mode 100644 index 
0000000000..5534cf3c35 --- /dev/null +++ b/yt_dlp/extractor/recurbate.py @@ -0,0 +1,43 @@ +import urllib.error + +from .common import InfoExtractor +from ..utils import ExtractorError, merge_dicts + + +class RecurbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?recurbate\.com/play\.php\?video=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://recurbate.com/play.php?video=39161415', + 'md5': 'dd2b4ec57aa3e3572cb5cf0997fca99f', + 'info_dict': { + 'id': '39161415', + 'ext': 'mp4', + 'description': 'md5:db48d09e4d93fc715f47fd3d6b7edd51', + 'title': 'Performer zsnicole33 show on 2022-10-25 20:23, Chaturbate Archive – Recurbate', + 'age_limit': 18, + }, + 'skip': 'Website require membership.', + }] + + def _real_extract(self, url): + SUBSCRIPTION_MISSING_MESSAGE = 'This video is only available for registered users; Set your authenticated browser user agent via the --user-agent parameter.' + video_id = self._match_id(url) + try: + webpage = self._download_webpage(url, video_id) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + raise + token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token') + video_url = f'https://recurbate.com/api/get.php?video={video_id}&token={token}' + + video_webpage = self._download_webpage(video_url, video_id) + if video_webpage == 'shall_subscribe': + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + entries = self._parse_html5_media_entries(video_url, video_webpage, video_id) + return merge_dicts({ + 'id': video_id, + 'title': self._html_extract_title(webpage, 'title'), + 'description': self._og_search_description(webpage), + 'age_limit': self._rta_search(webpage), + }, entries[0]) From 18f8fba7c89a87f99cc3313a1795848867e84fff Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Wed, 31 May 2023 19:08:28 +1200 Subject: [PATCH 148/501] [extractor/youtube] Fix continuation loop with no comments (#7148) Deep check the response for incomplete data. 
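[Editor's aside, not part of the commit message: the "deep check" amounts to
asserting that a comment renderer is actually reachable through the
continuation items, rather than only checking that the response envelope
exists. A minimal sketch using the same traverse_obj path as the diff below,
on a hypothetical response:]

from yt_dlp.utils import traverse_obj

ITEMS_PATH = ('onResponseReceivedEndpoints', ...,
              ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'),
              'continuationItems')

response = {'onResponseReceivedEndpoints': [{'appendContinuationItemsAction': {
    'continuationItems': [{'commentThreadRenderer': {'comment': {}}}]}}]}

# Shallow check: passes even when the page carries no comments.
print(bool(response.get('onResponseReceivedEndpoints')))  # True
# Deep check: only truthy when a real comment (thread) renderer came back.
print(bool(traverse_obj(
    response, (*ITEMS_PATH, ..., ('commentThreadRenderer', 'commentRenderer')))))  # True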
Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 3f0a4cd20a..ae4b58205f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3314,7 +3314,7 @@ def extract_header(contents): expected_comment_count = self._get_count( comments_header_renderer, 'countText', 'commentsCount') - if expected_comment_count: + if expected_comment_count is not None: tracker['est_total'] = expected_comment_count self.to_screen(f'Downloading ~{expected_comment_count} comments') comment_sort_index = int(get_single_config_arg('comment_sort') != 'top') # 1 = new, 0 = top @@ -3385,7 +3385,7 @@ def extract_thread(contents): if not tracker: tracker = dict( running_total=0, - est_total=0, + est_total=None, current_page_thread=0, total_parent_comments=0, total_reply_comments=0, @@ -3418,11 +3418,13 @@ def extract_thread(contents): continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id)) is_forced_continuation = True + continuation_items_path = ( + 'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems') for page_num in itertools.count(0): if not continuation: break headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response)) - comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})" + comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})" if page_num == 0: if is_first_continuation: note_prefix = 'Downloading comment section API JSON' @@ -3433,11 +3435,18 @@ def extract_thread(contents): note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) + + # Do a deep check for incomplete data as sometimes YouTube may return no comments for a continuation + # Ignore check if YouTube says the comment count is 0. + check_get_keys = None + if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): + check_get_keys = [[*continuation_items_path, ..., ( + 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] try: response = self._extract_response( item_id=None, query=continuation, ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, - check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) + check_get_keys=check_get_keys) except ExtractorError as e: # Ignore incomplete data error for replies if retries didn't work. # This is to allow any other parent comments and comment threads to be downloaded. 
@@ -3449,15 +3458,8 @@ def extract_thread(contents): else: raise is_forced_continuation = False - continuation_contents = traverse_obj( - response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) - continuation = None - for continuation_section in continuation_contents: - continuation_items = traverse_obj( - continuation_section, - (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'), - get_all=False, expected_type=list) or [] + for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): if is_first_continuation: continuation = extract_header(continuation_items) is_first_continuation = False From ecfe47973f6603b5367fe2cc3c65274627d94516 Mon Sep 17 00:00:00 2001 From: Mohamed Al Mehairbi <62325490+ItzMaxTV@users.noreply.github.com> Date: Wed, 31 May 2023 17:12:56 +0400 Subject: [PATCH 149/501] [extractor/elevensports] Add extractor (#7172) Closes #6737 Authored by: ItzMaxTV --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/elevensports.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 yt_dlp/extractor/elevensports.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index abe3c22889..808ede5bac 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -550,6 +550,7 @@ from .eighttracks import EightTracksIE from .einthusan import EinthusanIE from .eitb import EitbIE +from .elevensports import ElevenSportsIE from .ellentube import ( EllenTubeIE, EllenTubeVideoIE, diff --git a/yt_dlp/extractor/elevensports.py b/yt_dlp/extractor/elevensports.py new file mode 100644 index 0000000000..99c52b3a9a --- /dev/null +++ b/yt_dlp/extractor/elevensports.py @@ -0,0 +1,59 @@ +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class ElevenSportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clf46yr3kenn80jgrqsjmwefk', + 'title': 'Cleveland SC vs Lionsbridge FC', + 'ext': 'mp4', + 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58', + 'upload_date': '20230323', + 'timestamp': 1679612400, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clhpyd53b06160jez74qhgkmf', + 'title': 'AJNLF vs ARRAF', + 'ext': 'mp4', + 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1', + 'upload_date': '20230521', + 'timestamp': 1684684800, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + event_id = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['event']['mclsEventId'] + event_data = self._download_json( + f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id, + headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 
**traverse_obj(event_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('start_time', {parse_iso8601}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } From 1c16d9df5330819cc79ad588b24aa5b72765c168 Mon Sep 17 00:00:00 2001 From: CeruleanSky <CeruleanSky@users.noreply.github.com> Date: Thu, 1 Jun 2023 02:35:41 -0400 Subject: [PATCH 150/501] [extractor/twitter:spaces] Add `release_timestamp` (#7186) Authored by: CeruleanSky --- yt_dlp/extractor/twitter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index d9a89c44b6..4624ce5035 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -705,6 +705,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': r're:Monique Camarra.+?', 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', + 'release_timestamp': 1658417414, 'description': 'md5:acce559345fd49f129c20dbcda3f1201', 'timestamp': 1658407771464, }, @@ -1327,6 +1328,8 @@ def _real_extract(self, url): 'uploader_id': traverse_obj( metadata, ('creator_results', 'result', 'legacy', 'screen_name')), 'live_status': live_status, + 'release_timestamp': try_call( + lambda: int_or_none(metadata['scheduled_start'], scale=1000)), 'timestamp': metadata.get('created_at'), 'formats': formats, } From c35448b7b14113b35c4415dbfbf488c4731f006f Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Thu, 1 Jun 2023 20:43:32 +1200 Subject: [PATCH 151/501] [extractor/youtube] Extract more metadata for comments (#7179) Adds new comment fields: * `author_url` - The url to the comment author's page * `author_is_verified` - Whether the author is verified on the platform * `is_pinned` - Whether the comment is pinned to the top of the comments Closes https://github.com/yt-dlp/yt-dlp/issues/5411 Authored by: coletdjnz --- yt_dlp/extractor/common.py | 9 +++-- yt_dlp/extractor/youtube.py | 68 ++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 306911a6c7..fa46a5240f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -314,6 +314,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -325,8 +330,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. 
(It will be set diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ae4b58205f..ccf97705a1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3271,37 +3271,50 @@ def _extract_comment(self, comment_renderer, parent=None): if not comment_id: return - text = self._get_text(comment_renderer, 'contentText') + info = { + 'id': comment_id, + 'text': self._get_text(comment_renderer, 'contentText'), + 'like_count': self._get_count(comment_renderer, 'voteCount'), + 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})), + 'author': self._get_text(comment_renderer, 'authorText'), + 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})), + 'parent': parent or 'root', + } # Timestamp is an estimate calculated from the current time and time_text time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' timestamp = self._parse_time_text(time_text) - author = self._get_text(comment_renderer, 'authorText') - author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) - - votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), str)) or 0 - author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str) - - author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) - is_favorited = 'creatorHeart' in (try_get( - comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {}) - return { - 'id': comment_id, - 'text': text, + info.update({ + # FIXME: non-standard, but we need a way of showing that it is an estimate. 
+            '_time_text': time_text,
             'timestamp': timestamp,
-            'time_text': time_text,
-            'like_count': votes,
-            'is_favorited': is_favorited,
-            'author': author,
-            'author_id': author_id,
-            'author_thumbnail': author_thumbnail,
-            'author_is_uploader': author_is_uploader,
-            'parent': parent or 'root'
-        }
+        })
+
+        info['author_url'] = urljoin(
+            'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', (
+                ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))),
+                expected_type=str, get_all=False))
+
+        author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner')
+        if author_is_uploader is not None:
+            info['author_is_uploader'] = author_is_uploader
+
+        comment_abr = traverse_obj(
+            comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict)
+        if comment_abr is not None:
+            info['is_favorited'] = 'creatorHeart' in comment_abr
+
+        comment_ab_icontype = traverse_obj(
+            comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType'))
+        if comment_ab_icontype is not None:
+            info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE')
+
+        is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge')
+        if is_pinned:
+            info['is_pinned'] = True
+
+        return info
 
     def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
 
@@ -3349,14 +3362,13 @@ def extract_thread(contents):
                 comment = self._extract_comment(comment_renderer, parent)
                 if not comment:
                     continue
-                is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge'))
                 comment_id = comment['id']
-                if is_pinned:
+                if comment.get('is_pinned'):
                     tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.
                 # See: https://github.com/yt-dlp/yt-dlp/issues/6290
                 if comment_id in tracker['seen_comment_ids']:
-                    if comment_id in tracker['pinned_comment_ids'] and not is_pinned:
+                    if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'):
                         # Pinned comments may appear a second time in newest first sort
                         # See: https://github.com/yt-dlp/yt-dlp/issues/6712
                         continue

From f41b949a2ef646fbc36375febbe3f0c19d742c0f Mon Sep 17 00:00:00 2001
From: Daniel Rich <drich@employees.org>
Date: Thu, 1 Jun 2023 14:52:03 -0700
Subject: [PATCH 152/501] [extractor/nhk] Fix API extraction (#7180)

Closes #6992
Authored by: sjthespian, menschel

Co-authored-by: Patrick Menschel <menschel.p@posteo.de>
---
 yt_dlp/extractor/nhk.py    | 36 ++++++++++++++++++++++++++++--------
 yt_dlp/extractor/piksel.py | 16 +++++++++-------
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 1597962acf..a3efa326a1 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -67,7 +67,7 @@ def get_clean_field(key):
         info.update({
             '_type': 'url_transparent',
             'ie_key': 'Piksel',
-            'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
+            'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id,
             'id': vod_id,
         })
     else:
@@ -94,6 +94,19 @@ class NhkVodIE(NhkBaseIE):
     # Content available only for a limited period of time. Visit
     # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
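# --- Editor's aside (not part of the patch): the NHK fix above now hands
# Piksel a movie-s.nhk.or.jp URL, and the Piksel half of this patch (in the
# piksel.py hunks further below) builds its API endpoint from that page's
# host via urljoin, so the refid lookup stays on NHK's domain. A sketch; the
# refid and app token values are hypothetical.
from urllib.parse import urljoin

host = 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/nw_vod_v_en_2061_601'
print(urljoin(host, '/ws/ws_program/api/APP_TOKEN/mode/json/apiv/5'))
# https://movie-s.nhk.or.jp/ws/ws_program/api/APP_TOKEN/mode/json/apiv/5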
_TESTS = [{ + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'info_dict': { + 'id': 'yd8322ch', + 'ext': 'mp4', + 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', + 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', + 'upload_date': '20230514', + 'timestamp': 1684083791, + 'series': 'GRAND SUMO Highlights', + 'episode': '[Recap] May Tournament Day 1 (Opening Day)', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + }, + }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', @@ -104,6 +117,9 @@ class NhkVodIE(NhkBaseIE): 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', 'timestamp': 1565965194, 'upload_date': '20190816', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'series': 'Dining with the Chef', + 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { # audio clip @@ -114,10 +130,7 @@ class NhkVodIE(NhkBaseIE): 'title': "Japan's Top Inventions - Miniature Video Cameras", 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -133,7 +146,6 @@ class NhkVodIE(NhkBaseIE): }, { # video, alphabetic character in ID #29670 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', - 'only_matching': True, 'info_dict': { 'id': 'qfjay6cg', 'ext': 'mp4', @@ -142,7 +154,8 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', 'upload_date': '20210615', 'timestamp': 1623722008, - } + }, + 'skip': '404 Not Found', }] def _real_extract(self, url): @@ -153,12 +166,19 @@ class NhkVodProgramIE(NhkBaseIE): _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' 
% (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) _TESTS = [{ # video program episodes + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', + 'info_dict': { + 'id': 'sumo', + 'title': 'GRAND SUMO Highlights', + }, + 'playlist_mincount': 12, + }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', }, - 'playlist_mincount': 1, + 'playlist_mincount': 12, }, { # video program clips 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py index cc60b304e5..97a9bf5745 100644 --- a/yt_dlp/extractor/piksel.py +++ b/yt_dlp/extractor/piksel.py @@ -7,8 +7,10 @@ int_or_none, join_nonempty, parse_iso8601, + traverse_obj, try_get, unescapeHTML, + urljoin, ) @@ -63,11 +65,11 @@ class PikselIE(InfoExtractor): } ] - def _call_api(self, app_token, resource, display_id, query, fatal=True): - response = (self._download_json( - 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), - display_id, query=query, fatal=fatal) or {}).get('response') - failure = try_get(response, lambda x: x['failure']['reason']) + def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.com', fatal=True): + url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5') + response = traverse_obj( + self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {} + failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API' if failure: if fatal: raise ExtractorError(failure, expected=True) @@ -83,7 +85,7 @@ def _real_extract(self, url): ], webpage, 'app token') query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} program = self._call_api( - app_token, 'program', display_id, query)['WsProgramResponse']['program'] + app_token, 'program', display_id, query, url)['WsProgramResponse']['program'] video_id = program['uuid'] video_data = program['asset'] title = video_data['title'] @@ -129,7 +131,7 @@ def process_asset_files(asset_files): process_asset_files(try_get(self._call_api( app_token, 'asset_file', display_id, { 'assetid': asset_id, - }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) m3u8_url = dict_get(video_data, [ 'm3u8iPadURL', From 01231feb142e80828985aabdec04ac608e3d43e2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 2 Jun 2023 08:39:24 -0500 Subject: [PATCH 153/501] [extractor/twitch] Update `_CLIENT_ID` and add extractor-arg (#7200) Closes #7058, Closes #7183 Authored by: bashonly --- README.md | 3 +++ yt_dlp/extractor/twitch.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 25ed3b8441..3d89c0af94 100644 --- a/README.md +++ b/README.md @@ -1846,6 +1846,9 @@ #### twitter ### wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage +#### twitchstream (Twitch) +* `client_id`: Client ID value to be sent with GraphQL requests, e.g. 
`twitchstream:client_id=kimne78kx3ncx6brgo4mv6wki5h1ko` + **Note**: These options may be changed/removed in the future without concern for backward compatibility <!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE --> diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 4a17d80489..31b349bc68 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -41,7 +41,6 @@ class TwitchBaseIE(InfoExtractor): _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_FORM_URL = 'https://www.twitch.tv/login' _LOGIN_POST_URL = 'https://passport.twitch.tv/login' - _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' _OPERATION_HASHES = { @@ -58,6 +57,11 @@ class TwitchBaseIE(InfoExtractor): 'VideoPlayer_VODSeekbarPreviewVideo': '07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6', } + @property + def _CLIENT_ID(self): + return self._configuration_arg( + 'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key=TwitchStreamIE, casesense=True)[0] + def _perform_login(self, username, password): def fail(message): raise ExtractorError( From 55ed4ff73487feb3177b037dfc2ea527e777da3e Mon Sep 17 00:00:00 2001 From: Mohamed Al Mehairbi <62325490+ItzMaxTV@users.noreply.github.com> Date: Fri, 2 Jun 2023 19:01:55 +0400 Subject: [PATCH 154/501] [extractor/DigitalConcertHall] Support films (#7202) Authored by: ItzMaxTV Closes #7184 --- yt_dlp/extractor/digitalconcerthall.py | 27 +++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py index 3461e36eb6..c11cd790b0 100644 --- a/yt_dlp/extractor/digitalconcerthall.py +++ b/yt_dlp/extractor/digitalconcerthall.py @@ -11,7 +11,7 @@ class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' - _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert)/(?P<id>[0-9]+)' _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' @@ -40,6 +40,19 @@ class DigitalConcertHallIE(InfoExtractor): }, 'params': {'skip_download': 'm3u8'}, 'playlist_count': 3, + }, { + 'url': 'https://www.digitalconcerthall.com/en/film/388', + 'info_dict': { + 'id': '388', + 'ext': 'mp4', + 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann', + 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20220714', + 'timestamp': 1657785600, + 'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff', + }, + 'params': {'skip_download': 'm3u8'}, }] def _perform_login(self, username, password): @@ -75,7 +88,7 @@ def _real_initialize(self): if not self._ACCESS_TOKEN: self.raise_login_required(method='password') - def _entries(self, items, language, **kwargs): + def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] stream_info = self._download_json( @@ -103,11 +116,11 @@ def _entries(self, items, language, **kwargs): 'start_time': chapter.get('time'), 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), 'title': chapter.get('text'), - } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None, } def 
_real_extract(self, url): - language, video_id = self._match_valid_url(url).group('language', 'id') + language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id') if not language: language = 'en' @@ -120,18 +133,18 @@ def _real_extract(self, url): }] vid_info = self._download_json( - f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={ 'Accept': 'application/json', 'Accept-Language': language }) album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...)) return { '_type': 'playlist', 'id': video_id, 'title': vid_info.get('title'), - 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, - thumbnails=thumbnails, album_artist=album_artist), + 'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_), 'thumbnails': thumbnails, 'album_artist': album_artist, } From 1a7dcca378e80a387923ee05c250d8ba122441c6 Mon Sep 17 00:00:00 2001 From: Jeroen Jacobs <git@jeroenj.be> Date: Fri, 2 Jun 2023 20:29:00 +0200 Subject: [PATCH 155/501] [extractor/vrt] Overhaul extractors (#6244) * Fixes `VrtNU` extractor to work with the VRT MAX site change * Adapts `VRT`, `Ketnet` and `DagelijkseKost` extractors to the new VRT API * Removes `Canvas` and `CanvasEen` extractors; the sites and API no longer exist * Moves all remaining VRT-related extractors into the `vrt` module Closes #4908 Authored by: jeroenj, bergoid, bashonly Co-authored-by: bergoid <bergoid@users.noreply.github.com> Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 14 +- yt_dlp/extractor/canvas.py | 383 ----------------------------- yt_dlp/extractor/ketnet.py | 70 ------ yt_dlp/extractor/vrt.py | 413 +++++++++++++++++++++++++++++--- 4 files changed, 384 insertions(+), 496 deletions(-) delete mode 100644 yt_dlp/extractor/canvas.py delete mode 100644 yt_dlp/extractor/ketnet.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 808ede5bac..7120fd37d1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -295,12 +295,6 @@ from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, @@ -894,7 +888,6 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, @@ -2285,7 +2278,12 @@ VoxMediaVolumeIE, VoxMediaIE, ) -from .vrt import VRTIE +from .vrt import ( + VRTIE, + VrtNUIE, + KetnetIE, + DagelijkseKostIE, +) from .vrak import VrakIE from .vrv import ( VRVIE, diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py deleted file mode 100644 index ae6e03a4d5..0000000000 --- a/yt_dlp/extractor/canvas.py +++ /dev/null @@ -1,383 +0,0 @@ -import json - - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - extract_attributes, - float_or_none, - get_element_by_class, - int_or_none, - merge_dicts, - str_or_none, - strip_or_none, - url_or_none, - urlencode_postdata 
-) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8_native', - } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') - - data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'null', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{site_id} {video_id}' - description = data.get('description') - - formats = [] - subtitles = {} - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - fmts, subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif 
format_type == 'HSS': - fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' - _TESTS = [{ - # Available via old API endpoint - 
'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(<nui-media[^>]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id - - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { - '_type': 'url_transparent', - 'url': 
'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) - - -class DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/yt_dlp/extractor/ketnet.py b/yt_dlp/extractor/ketnet.py deleted file mode 100644 index ab6276727a..0000000000 --- a/yt_dlp/extractor/ketnet.py +++ /dev/null @@ -1,70 +0,0 @@ -from .canvas import CanvasIE -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', - 'ext': 'mp4', - 'title': 'Nachtwacht - Reeks 3: Aflevering 1', - 'description': 'De Nachtwacht krijgt te maken met een parasiet', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - 'timestamp': 1609225200, - 'upload_date': '20201229', - 'series': 'Nachtwacht', - 'season': 'Reeks 3', - 'episode': 'De Greystook', - 'episode_number': 1, - }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], - }, { - 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ - 'query': '''{ - video(id: "content/ketnet/nl/%s.model.json") { - description - episodeNr - imageUrl - mediaReference - programTitle - publicationDate - seasonTitle - subtitleVideodetail - titleVideodetail - } -}''' % display_id, - })['data']['video'] - - mz_id = compat_urllib_parse_unquote(video['mediaReference']) - - return { - '_type': 'url_transparent', - 'id': mz_id, - 'title': 
video['titleVideodetail'], - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, - 'thumbnail': video.get('imageUrl'), - 'description': video.get('description'), - 'timestamp': parse_iso8601(video.get('publicationDate')), - 'series': video.get('programTitle'), - 'season': video.get('seasonTitle'), - 'episode': video.get('subtitleVideodetail'), - 'episode_number': int_or_none(video.get('episodeNr')), - 'ie_key': CanvasIE.ie_key(), - } diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 26f48bf67f..bacd3df29a 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,45 +1,137 @@ -from .common import InfoExtractor +import functools +import json +import time +import urllib.error +import urllib.parse + +from .gigya import GigyaBaseIE from ..utils import ( + ExtractorError, + clean_html, extract_attributes, float_or_none, get_element_by_class, + get_element_html_by_class, + int_or_none, + join_nonempty, + jwt_encode_hs256, + make_archive_id, + parse_age_limit, + parse_iso8601, + str_or_none, strip_or_none, - unified_timestamp, + traverse_obj, + url_or_none, + urlencode_postdata, ) -class VRTIE(InfoExtractor): +class VRTBaseIE(GigyaBaseIE): + _GEO_BYPASS = False + _PLAYER_INFO = { + 'platform': 'desktop', + 'app': { + 'type': 'browser', + 'name': 'Chrome', + }, + 'device': 'undefined (undefined)', + 'os': { + 'name': 'Windows', + 'version': 'x86_64' + }, + 'player': { + 'name': 'VRT web player', + 'version': '2.7.4-prod-2023-04-19T06:05:45' + } + } + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.fd1de01a40a1e3d842ea.js + _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' + _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' + + def _extract_formats_and_subtitles(self, data, video_id): + if traverse_obj(data, 'drm'): + self.report_drm(video_id) + + formats, subtitles = [], {} + for target in traverse_obj(data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + format_type = target['type'].upper() + format_url = target['url'] + if format_type in ('HLS', 'HLS_AES'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HSS': + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + + for sub in traverse_obj(data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED')): + subtitles.setdefault('nl', []).append({'url': sub['url']}) + + return formats, subtitles + + def _call_api(self, video_id, client='null', id_token=None, version='v2'): + player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} + player_token = self._download_json( + 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + video_id, 'Downloading player token', headers={ + **self.geo_verification_headers(), + 
'Content-Type': 'application/json', + }, data=json.dumps({ + 'identityToken': id_token or {}, + 'playerInfo': jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ + 'kid': self._JWT_KEY_ID + }).decode() + }, separators=(',', ':')).encode())['vrtPlayerToken'] + + return self._download_json( + f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}', + video_id, 'Downloading API JSON', query={ + 'vrtPlayerToken': player_token, + 'client': client, + }, expected_status=400) + + +class VRTIE(VRTBaseIE): IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'md5': 'e1663accf5cf13f375f3cd0d10476669', 'info_dict': { 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', 'ext': 'mp4', 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', - 'timestamp': 1557924660, - 'upload_date': '20190515', + 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff', 'duration': 31.2, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'md5': '910bba927566e9ab992278f647eb4b75', 'info_dict': { 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', - 'timestamp': 1557923760, - 'upload_date': '20190515', + 'title': 'De Belgian Cats zijn klaar voor het EK', + 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', 'duration': 115.17, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg', }, - }, { - 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', - 'only_matching': True, - }, { - 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }] _CLIENT_MAP = { 'vrt.be/vrtnws': 'vrtnieuws', @@ -49,34 +141,285 @@ class VRTIE(InfoExtractor): def _real_extract(self, url): site, display_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, display_id) - attrs = extract_attributes(self._search_regex( - r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video')) + attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '') - asset_id = attrs['data-video-id'] - publication_id = attrs.get('data-publication-id') + asset_id = attrs.get('data-video-id') or attrs['data-videoid'] + publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid') if publication_id: - asset_id = publication_id + '$' + asset_id - client = attrs.get('data-client-code') or self._CLIENT_MAP[site] + asset_id = f'{publication_id}${asset_id}' + client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site] + + data = self._call_api(asset_id, client) + formats, subtitles = self._extract_formats_and_subtitles(data, asset_id) - title = 
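
The player-token request above boils down to signing the static player metadata as an HS256 JWT and posting it next to the (possibly empty) identity token. A condensed sketch reusing the key material from the diff; the payload is trimmed to two fields for brevity, not the full `_PLAYER_INFO`:

import json
import time

from yt_dlp.utils import jwt_encode_hs256

payload = {'exp': round(time.time(), 3) + 900, 'platform': 'desktop'}
token = jwt_encode_hs256(
    payload, '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae',
    headers={'kid': '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w='}).decode()

# This signed blob is what the /tokens endpoint receives as 'playerInfo'
body = json.dumps({'identityToken': {}, 'playerInfo': token}, separators=(',', ':'))
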
strip_or_none(get_element_by_class( - 'vrt-title', webpage) or self._html_search_meta( - ['og:title', 'twitter:title', 'name'], webpage)) description = self._html_search_meta( ['og:description', 'twitter:description', 'description'], webpage) if description == '…': description = None - timestamp = unified_timestamp(self._html_search_meta( - 'article:published_time', webpage)) return { - '_type': 'url_transparent', 'id': asset_id, - 'display_id': display_id, - 'title': title, + 'formats': formats, + 'subtitles': subtitles, 'description': description, - 'thumbnail': attrs.get('data-posterimage'), - 'timestamp': timestamp, + 'thumbnail': url_or_none(attrs.get('data-posterimage')), 'duration': float_or_none(attrs.get('data-duration'), 1000), - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), - 'ie_key': 'Canvas', + '_old_archive_ids': [make_archive_id('Canvas', asset_id)], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('shortDescription', {str}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('posterImageUrl', {url_or_none}), + }), + } + + +class VrtNUIE(VRTBaseIE): + IE_DESC = 'VRT MAX' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' + _TESTS = [{ + # CONTENT_IS_AGE_RESTRICTED + 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/', + 'info_dict': { + 'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f', + 'ext': 'mp4', + 'title': 'Tom Waes', + 'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.', + 'timestamp': 1673905125, + 'release_timestamp': 1673905125, + 'series': 'De ideale wereld', + 'season_id': '1672830988794', + 'episode': 'Aflevering 1', + 'episode_number': 1, + 'episode_id': '1672830988861', + 'display_id': 'de-ideale-wereld-d20230116', + 'channel': 'VRT', + 'duration': 1939.0, + 'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg', + 'release_date': '20230116', + 'upload_date': '20230116', + 'age_limit': 12, + }, + }, { + 'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/', + 'info_dict': { + 'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee', + 'ext': 'mp4', + 'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'', + 'description': 'md5:197424726c61384b4e5c519f16c0cf02', + 'timestamp': 1652940000, + 'release_timestamp': 1652940000, + 'series': 'Buurman, wat doet u nu?', + 'season': 'Seizoen 6', + 'season_number': 6, + 'season_id': '1652344200907', + 'episode': 'Aflevering 0', + 'episode_number': 0, + 'episode_id': '1652951873524', + 'display_id': 'buurman--wat-doet-u-nu--s6-trailer', + 'channel': 'VRT', + 'duration': 33.13, + 'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg', + 'release_date': '20220519', + 'upload_date': '20220519', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _NETRC_MACHINE = 'vrtnu' + _authenticated = False + + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) + + if auth_info.get('errorDetails'): + raise ExtractorError(f'Unable to login. 
VrtNU said: {auth_info["errorDetails"]}', expected=True) + + # Sometimes authentication fails for no good reason, retry + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, None) + try: + self._request_webpage( + 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', + errnote='Could not get XSRF Token', query={ + 'provider': 'site', + 'destination': 'https://www.vrt.be/vrtnu/', + }) + self._request_webpage( + 'https://login.vrt.be/perform_login', None, + note='Performing login', errnote='Login failed', + query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ + 'UID': auth_info['UID'], + 'UIDSignature': auth_info['UIDSignature'], + 'signatureTimestamp': auth_info['signatureTimestamp'], + '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + })) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + retry.error = e + continue + raise + + self._authenticated = True + + def _real_extract(self, url): + display_id = self._match_id(url) + parsed_url = urllib.parse.urlparse(url) + details = self._download_json( + f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json', + display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details'] + + watch_info = traverse_obj(details, ( + 'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {} + video_id = join_nonempty( + 'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info) + if '$' not in video_id: + raise ExtractorError('Unable to extract video ID') + + vrtnutoken = self._download_json( + 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', + errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None + + video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken) + + if 'title' not in video_info: + code = video_info.get('code') + if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'): + self.raise_login_required(code, method='password') + elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'): + self.raise_geo_restricted(countries=['BE']) + elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS': + if not self._authenticated: + self.raise_login_required(code, method='password') + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(code, expected=True) + + formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id) + + return { + **traverse_obj(details, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'series': ('data', 'program', 'title'), + 'season': ('data', 'season', 'title', 'value'), + 'season_number': ('data', 'season', 'title', 'raw', {int_or_none}), + 'season_id': ('data', 'season', 'id', {str_or_none}), + 'episode': ('data', 'episode', 'number', 'value', {str_or_none}), + 'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}), + 'episode_id': ('data', 'episode', 'id', {str_or_none}), + 'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}), + }), + 'id': video_id, + 'display_id': display_id, + 'channel': 'VRT', + 'formats': formats, + 'duration': float_or_none(video_info.get('duration'), 1000), + 'thumbnail': url_or_none(video_info.get('posterImageUrl')), + 'subtitles': subtitles, + '_old_archive_ids': 
[make_archive_id('Canvas', video_id)], + } + + +class KetnetIE(VRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', + 'info_dict': { + 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ext': 'mp4', + 'title': 'Meisjes', + 'episode': 'Reeks 6: Week 5', + 'season': 'Reeks 6', + 'series': 'Meisjes', + 'timestamp': 1685251800, + 'upload_date': '20230528', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] + + video_id = urllib.parse.unquote(video['mediaReference']) + data = self._call_api(video_id, 'ketnet@PROD', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('Canvas', video_id)], + **traverse_obj(video, { + 'title': ('titleVideodetail', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'timestamp': ('publicationDate', {parse_iso8601}), + 'series': ('programTitle', {str}), + 'season': ('seasonTitle', {str}), + 'episode': ('subtitleVideodetail', {str}), + 'episode_number': ('episodeNr', {int_or_none}), + }), + } + + +class DagelijkseKostIE(VRTBaseIE): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'display_id': 'hachis-parmentier-met-witloof', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id') + + data = self._call_api(video_id, 'dako@prod', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'display_id': display_id, + 'title': strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage) or self._html_search_meta('twitter:title', webpage)), + 'description': clean_html(get_element_by_class( + 'dish-description', webpage)) or self._html_search_meta( + ['description', 'twitter:description', 'og:description'], webpage), + '_old_archive_ids': [make_archive_id('Canvas', video_id)], } From 2fb35f6004c7625f0dd493da4a5abf0690f7777c Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 3 Jun 2023 18:33:51 +1200 Subject: [PATCH 156/501] [extractor/youtube] Support shorter relative time format (#7191) See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/1067 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ccf97705a1..6e7485c030 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -893,9 +893,16 @@ def _extract_thumbnails(data, *path_list): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + + # XXX: this could be moved to a general function in utils.py + # The relative time text strings are roughly the same as what + # Javascript's Intl.RelativeTimeFormat function generates. + # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat + mobj = re.search( + r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago', + relative_time_text) if mobj: start = mobj.group('start') if start: From c91ac833ea99b00506e470a44cf930e4e23378c9 Mon Sep 17 00:00:00 2001 From: Paul Wise <pabs3@bonedaddy.net> Date: Sun, 4 Jun 2023 16:04:47 +0800 Subject: [PATCH 157/501] [extractor/acast] Support embeds (#7212) Authored by: pabs3 --- yt_dlp/extractor/acast.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/acast.py b/yt_dlp/extractor/acast.py index f2f828f8e7..427d04c312 100644 --- a/yt_dlp/extractor/acast.py +++ b/yt_dlp/extractor/acast.py @@ -40,28 +40,33 @@ def _call_api(self, path, video_id, query=None): class ACastIE(ACastBaseIE): IE_NAME = 'acast' - _VALID_URL = r'''(?x) + _VALID_URL = r'''(?x: https?:// (?: (?:(?:embed|www)\.)?acast\.com/| play\.acast\.com/s/ ) - (?P<channel>[^/]+)/(?P<id>[^/#?]+) - ''' + (?P<channel>[^/]+)/(?P<id>[^/#?"]+) + )''' + _EMBED_REGEX = [rf'(?x)<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', + 'description': 'md5:013959207e05011ad14a222cf22278cc', 'timestamp': 1477346700, 'upload_date': '20161024', 'duration': 2766, - 'creator': 'Anton Berg & Martin Johnson', + 'creator': 'Third Ear Studio', 'series': 'Spår', 'episode': '2. 
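
The widened pattern in the youtube change above accepts both the long unit names ('6 days ago') and the abbreviated ones ('8 yr ago') that YouTube now emits. A simplified, self-contained model of what `extract_relative_time` does with such a match; it skips the 'today/yesterday/now' branch and approximates months and years as fixed spans, which the real helper handles more carefully:

import datetime as dt
import re

_UNIT_SECONDS = {
    's': 1, 'sec': 1, 'second': 1, 'min': 60, 'minute': 60,
    'h': 3600, 'hr': 3600, 'hour': 3600, 'd': 86400, 'day': 86400,
    'w': 604800, 'wk': 604800, 'week': 604800, 'mo': 2592000,
    'month': 2592000, 'y': 31536000, 'yr': 31536000, 'year': 31536000,
}

def rough_relative_time(text):
    # Mirrors only the numeric branch of the extractor's regex
    mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>[a-z]+?)s?\s*ago', text)
    if not mobj or mobj.group('unit') not in _UNIT_SECONDS:
        return None
    seconds = int(mobj.group('time')) * _UNIT_SECONDS[mobj.group('unit')]
    return dt.datetime.now() - dt.timedelta(seconds=seconds)

print(rough_relative_time('streamed 8 yr ago'))
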
Raggarmordet - Röster ur det förflutna', + 'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg', + 'episode_number': 2, + 'display_id': '2.raggarmordet-rosterurdetforflutna', + 'season_number': 4, + 'season': 'Season 4', } }, { 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', @@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE): 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government', + 'info_dict': { + 'id': '646c68fb21fbf20011e9c651', + 'ext': 'mp3', + 'creator': 'The Australian National University', + 'display_id': 'can-labor-be-a-long-form-government', + 'duration': 2618, + 'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg', + 'title': 'Can Labor be a long-form government?', + 'episode': 'Can Labor be a long-form government?', + 'upload_date': '20230523', + 'series': 'Democracy Sausage with Mark Kenny', + 'timestamp': 1684826362, + 'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16', + } + }] def _real_extract(self, url): channel, display_id = self._match_valid_url(url).groups() From 12037d8b0a578fcc78a5c8f98964e48ee6060e25 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 06:10:30 -0500 Subject: [PATCH 158/501] [extractor/substack] Fix extraction (#7218) Closes #7155 Authored by: bashonly --- yt_dlp/extractor/substack.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index fa3826388b..3782ceed1c 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -2,7 +2,7 @@ import urllib.parse from .common import InfoExtractor -from ..utils import str_or_none, traverse_obj +from ..utils import js_to_json, str_or_none, traverse_obj class SubstackIE(InfoExtractor): @@ -14,7 +14,7 @@ class SubstackIE(InfoExtractor): 'id': '47660949', 'ext': 'mp4', 'title': 'I MADE A VLOG', - 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 'description': 'md5:9248af9a759321e1027226f988f54d96', 'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', 'uploader': 'Maybe Baby', 'uploader_id': '33628', @@ -77,7 +77,9 @@ def _real_extract(self, url): display_id, username = self._match_valid_url(url).group('id', 'username') webpage = self._download_webpage(url, display_id) - webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + webpage_info = self._parse_json(self._search_json( + r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) post_type = webpage_info['post']['type'] formats, subtitles = [], {} From 971d901d129403e875a04dd92109507a03fbc070 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 07:03:44 -0500 Subject: [PATCH 159/501] [extractor/tencent] Fix fatal metadata extraction (#7219) Closes #7177 Authored by: bashonly --- yt_dlp/extractor/tencent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py index 42a2175b0f..6618ea4e6e 100644 --- a/yt_dlp/extractor/tencent.py +++ b/yt_dlp/extractor/tencent.py @@ -163,11 +163,9 @@ class 
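
The substack fix above works because the page now stores its state as `window._preloads = JSON.parse("…")`: the outer value is a JSON string literal whose content is another JSON document, hence `_search_json` with `transform_source=js_to_json` followed by a second `_parse_json`. The resulting double decode in isolation, against a made-up payload:

import json

# Stand-in for what _search_json pulls out of the JSON.parse(...) argument
raw = '"{\\"post\\": {\\"type\\": \\"video\\"}}"'
preloads = json.loads(json.loads(raw))
assert preloads['post']['type'] == 'video'
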
VQQBaseIE(TencentBaseIE): _REFERER = 'v.qq.com' def _get_webpage_metadata(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>', - webpage, 'pinia data', fatal=False), - video_id, transform_source=js_to_json, fatal=False) + return self._search_json( + r'<script[^>]*>[^<]*window\.__(?:pinia|PINIA__)\s*=', + webpage, 'pinia data', video_id, transform_source=js_to_json, fatal=False) class VQQVideoIE(VQQBaseIE): @@ -176,7 +174,7 @@ class VQQVideoIE(VQQBaseIE): _TESTS = [{ 'url': 'https://v.qq.com/x/page/q326831cny0.html', - 'md5': '84568b3722e15e9cd023b5594558c4a7', + 'md5': 'b11c9cb781df710d686b950376676e2a', 'info_dict': { 'id': 'q326831cny0', 'ext': 'mp4', @@ -187,7 +185,7 @@ class VQQVideoIE(VQQBaseIE): }, }, { 'url': 'https://v.qq.com/x/page/o3013za7cse.html', - 'md5': 'cc431c4f9114a55643893c2c8ebf5592', + 'md5': 'a1bcf42c6d28c189bd2fe2d468abb287', 'info_dict': { 'id': 'o3013za7cse', 'ext': 'mp4', @@ -208,6 +206,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '鸡毛飞上天', 'format_id': r're:^shd', }, + 'skip': '404', }, { 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', 'md5': 'fadd10bf88aec3420f06f19ee1d24c5b', @@ -220,6 +219,7 @@ class VQQVideoIE(VQQBaseIE): 'series': '青年理工工作者生活研究所', 'format_id': r're:^shd', }, + 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to China 'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html', From 5ee9a7d6e18ceea956e831994cf11c423979354f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 07:15:09 -0500 Subject: [PATCH 160/501] [extractor/sverigesradio] Support slug URLs (#7220) Closes #7145 Authored by: bashonly --- yt_dlp/extractor/sverigesradio.py | 62 +++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/sverigesradio.py b/yt_dlp/extractor/sverigesradio.py index 65da615d00..01a07b3995 100644 --- a/yt_dlp/extractor/sverigesradio.py +++ b/yt_dlp/extractor/sverigesradio.py @@ -1,8 +1,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, + get_element_by_id, + get_element_html_by_class, int_or_none, str_or_none, + traverse_obj, + url_or_none, ) @@ -21,7 +26,15 @@ class SverigesRadioBaseIE(InfoExtractor): } def _real_extract(self, url): - audio_id = self._match_id(url) + audio_id, display_id = self._match_valid_url(url).group('id', 'slug') + if not audio_id: + webpage = self._download_webpage(url, display_id) + audio_id = ( + traverse_obj( + get_element_html_by_class('audio-button', webpage), + ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False) + or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId']) + query = { 'id': audio_id, 'type': self._AUDIO_TYPE, @@ -30,7 +43,6 @@ def _real_extract(self, url): item = self._download_json( self._BASE_URL + 'audiometadata', audio_id, 'Downloading audio JSON metadata', query=query)['items'][0] - title = item['subtitle'] query['format'] = 'iis' urls = [] @@ -61,18 +73,20 @@ def _real_extract(self, url): return { 'id': audio_id, - 'title': title, 'formats': formats, - 'series': item.get('title'), - 'duration': int_or_none(item.get('duration')), - 'thumbnail': item.get('displayimageurl'), - 'description': item.get('description'), + **traverse_obj(item, { + 'title': 'subtitle', + 'series': 'title', + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('displayimageurl', {url_or_none}), 
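
When a sverigesradio URL carries only a slug, the code above recovers the audio id from the page markup by chaining `get_element_html_by_class` and `extract_attributes` inside one `traverse_obj` call. The same lookup unrolled against a minimal, made-up page fragment:

from yt_dlp.utils import extract_attributes, get_element_html_by_class, traverse_obj

webpage = '<button class="audio-button" data-audio-id="2160416" data-publication-id="98">Lyssna</button>'

# {extract_attributes} applies the function to the matched element, then the
# inner tuple tries the two attribute names in order; get_all=False returns
# the first hit
audio_id = traverse_obj(
    get_element_html_by_class('audio-button', webpage),
    ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False)
assert audio_id == '2160416'
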
+ 'description': 'description', + }), } class SverigesRadioPublicationIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:publication' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P<id>[0-9]+)|/(?P<slug>[\w-]+))' _TESTS = [{ 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546', 'md5': '6a4917e1923fccb080e5a206a5afa542', @@ -85,6 +99,18 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): 'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas', + 'md5': 'f8a914ad50f491bb74eed403ab4bfef6', + 'info_dict': { + 'id': '8360345', + 'ext': 'm4a', + 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas', + 'series': 'Radiosporten', + 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f', + 'duration': 72, + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887', 'only_matching': True, @@ -94,8 +120,8 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): class SverigesRadioEpisodeIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:episode' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P<id>\d+)|(?P<slug>[\w-]+))(?:$|[#?])' + _TESTS = [{ 'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300', 'md5': '20dc4d8db24228f846be390b0c59a07c', 'info_dict': { @@ -106,6 +132,18 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE): 'title': 'Metoo och valen', 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', 'thumbnail': r're:^https?://.*\.jpg', - } - } + }, + }, { + 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023', + 'md5': 'ce17fb82520a8033dbb846993d5589fe', + 'info_dict': { + 'id': '2160416', + 'ext': 'm4a', + 'title': 'P4 Live med First Aid Kit', + 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d', + 'thumbnail': r're:^https?://.*\.jpg', + 'series': 'P4 Live', + 'duration': 5640, + }, + }] _AUDIO_TYPE = 'episode' From 97d60ad8cd6c99f01e463a9acfce8693aff2a609 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 08:37:59 -0500 Subject: [PATCH 161/501] [extractor/foxnews] Fix extractors (#7222) Closes #6050 Authored by: bashonly --- yt_dlp/extractor/amp.py | 9 +++-- yt_dlp/extractor/foxnews.py | 77 +++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/amp.py b/yt_dlp/extractor/amp.py index b0cbd775c0..0d259c549f 100644 --- a/yt_dlp/extractor/amp.py +++ b/yt_dlp/extractor/amp.py @@ -5,6 +5,7 @@ int_or_none, mimetype2ext, parse_iso8601, + strip_jsonp, unified_timestamp, url_or_none, ) @@ -15,7 +16,7 @@ class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _extract_feed_info(self, url): feed = self._download_json( url, None, 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed') + 'Unable to download Akamai AMP feed', transform_source=strip_jsonp) item = feed.get('channel', {}).get('item') if not item: raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) @@ -73,8 +74,10 @@ 
def get_media_node(name, default=None): media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py index 52172aacef..6aa63614ef 100644 --- a/yt_dlp/extractor/foxnews.py +++ b/yt_dlp/extractor/foxnews.py @@ -7,8 +7,37 @@ class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' + _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ + { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - 
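
The foxnews rework points `_extract_feed_info` at an endpoint that answers as JSONP (`?callback=uid_<video_id>`), which is why the AMP base class above now passes `transform_source=strip_jsonp`. What that transform contributes, shown against a hypothetical response body:

import json

from yt_dlp.utils import strip_jsonp

jsonp = 'uid_6320653836112({"channel": {"item": {"title": "clip"}}});'
feed = json.loads(strip_jsonp(jsonp))  # wrapper and trailing ';' are stripped
assert feed['channel']['item']['title'] == 'clip'
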
}, ] @classmethod @@ -67,10 +89,10 @@ def _extract_embed_urls(cls, url, webpage): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ def _real_extract(self, url): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! - Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, From 4815d35c191e7d375b94492a6486dd2ba43a8954 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 08:49:10 -0500 Subject: [PATCH 162/501] [extractor/sonyliv] Fix login with token (#7223) Authored by: bashonly --- yt_dlp/extractor/sonyliv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py index aaad420f12..5ebe20df7a 100644 --- a/yt_dlp/extractor/sonyliv.py +++ 
b/yt_dlp/extractor/sonyliv.py @@ -10,6 +10,8 @@ from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, + try_call, try_get, ) @@ -77,8 +79,10 @@ def _perform_login(self, username, password): self._HEADERS['device_id'] = self._get_device_id() self._HEADERS['content-type'] = 'application/json' - if username.lower() == 'token' and len(password) > 1198: + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._HEADERS['authorization'] = password + self.report_login() + return elif len(username) != 10 or not username.isdigit(): raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}') From 7bc92517463f5766e9d9b92c3823b5cf403c0e3d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:07:13 -0500 Subject: [PATCH 163/501] [extractor/shemaroome] Pass `stream_key` header to downloader (#7224) Closes #7133 Authored by: bashonly --- yt_dlp/extractor/shemaroome.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py index 7a78c6e054..ec9938b8cb 100644 --- a/yt_dlp/extractor/shemaroome.py +++ b/yt_dlp/extractor/shemaroome.py @@ -73,7 +73,10 @@ def _real_extract(self, url): key = bytes_to_intlist(compat_b64decode(data_json['key'])) iv = [0] * 16 m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') - formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + headers = {'stream_key': data_json['stream_key']} + formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers) + for fmt in formats: + fmt['http_headers'] = headers release_date = self._html_search_regex( (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), From 7f8ddebbb51c9fd4a347306332a718ba41b371b8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:19:16 -0500 Subject: [PATCH 164/501] [extractor/hotstar] Support `/shows/` URLs (#7225) Closes #6463 Authored by: bashonly --- yt_dlp/extractor/hotstar.py | 40 +++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index cea1812f15..591e23b8ad 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -83,7 +83,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?P<type>movies|sports|episode|(?P<tv>tv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? 
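
The shemaroome stream URL above is recovered by AES-CBC-decrypting a base64 blob with a zero IV, and the fix then forwards the same `stream_key` header to the downloader so segment requests are authorized too. A condensed sketch of the decryption step, mirroring the extractor's own helper calls; the key here is a 16-byte placeholder, not the site's real one:

import base64

from yt_dlp.aes import aes_cbc_decrypt, unpad_pkcs7
from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes

key = bytes_to_intlist(b'0123456789abcdef')  # placeholder; the real key comes from data_json
iv = [0] * 16

def decrypt_m3u8_url(url_data_b64):
    url_data = bytes_to_intlist(base64.b64decode(url_data_b64))
    return unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
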
[^/?#]+/ @@ -122,6 +122,25 @@ class HotStarIE(HotStarBaseIE): 'episode': 'Janhvi Targets Suman', 'episode_number': 8, } + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843', + 'info_dict': { + 'id': '1000282843', + 'ext': 'mp4', + 'title': 'Anupama, Anuj Share a Moment', + 'season': 'Chapter 1', + 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0', + 'timestamp': 1678149000, + 'channel': 'StarPlus', + 'series': 'Anupama', + 'season_number': 1, + 'season_id': 7399, + 'upload_date': '20230307', + 'episode': 'Anupama, Anuj Share a Moment', + 'episode_number': 853, + 'duration': 1272, + 'channel_id': 3, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -139,6 +158,7 @@ class HotStarIE(HotStarBaseIE): 'sports': 'match', 'episode': 'episode', 'tv': 'episode', + 'shows': 'episode', None: 'content', } @@ -304,13 +324,16 @@ def _real_extract(self, url): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { 'id': '3_2_26', }, 'playlist_mincount': 20, + }, { + 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'only_matching': True, }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, @@ -327,7 +350,7 @@ def _real_extract(self, url): class HotStarSeasonIE(HotStarBaseIE): IE_NAME = 'hotstar:season' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', 'info_dict': { @@ -346,6 +369,9 @@ class HotStarSeasonIE(HotStarBaseIE): 'id': '8208', }, 'playlist_mincount': 19, + }, { + 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/', + 'only_matching': True, }] def _real_extract(self, url): @@ -356,7 +382,7 @@ def _real_extract(self, url): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -375,6 +401,12 @@ class HotStarSeriesIE(HotStarBaseIE): 'id': '435', }, 'playlist_mincount': 267, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/', + 'info_dict': { + 'id': '1260022017', + }, + 'playlist_mincount': 940, }] def _real_extract(self, url): From c2a1bdb00931969193f2a31ea27b9c66a07aaec2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Jun 2023 09:28:40 -0500 Subject: [PATCH 165/501] [extractor/tiktok] Extract 1080p adaptive formats (#7228) Closes #7109 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 63708229ee..49035e971c 100644 --- 
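
The `(?(tv)...)` construct in the hotstar pattern above is a regex conditional: when the named group `tv` participated in the match ('tv' or 'shows'), exactly two extra path components are required before the slug; otherwise anything up to the slug is allowed. An isolated, runnable illustration with a trimmed-down version of the pattern:

import re

pattern = re.compile(r'''(?x)
    (?P<type>movies|sports|episode|(?P<tv>tv|shows))/
    (?(tv)(?:[^/?#]+/){2}|[^?#]*)
    (?P<slug>[^/?#]+)/
    (?P<id>\d+)''')

m = pattern.match('shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843')
assert m and m.group('id') == '1000282843'
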
a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -62,7 +62,7 @@ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -79,11 +79,11 @@ def _build_api_query(self, query, app_version, manifest_app_version): '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', - 'device_type': 'Pixel 4', + 'device_type': 'Pixel 7', 'device_platform': 'android', - 'resolution': '1080*1920', + 'resolution': '1080*2400', 'dpi': 420, - 'os_version': '10', + 'os_version': '13', 'os_api': '29', 'carrier_region': 'US', 'sys_region': 'US', @@ -624,6 +624,32 @@ class TikTokIE(TikTokBaseIE): 'thumbnails': 'count:3', }, 'expected_warnings': ['Unable to find video in feed'], + }, { + # 1080p format + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', + 'md5': '982512017a8a917124d5a08c8ae79621', + 'info_dict': { + 'id': '7107337212743830830', + 'ext': 'mp4', + 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok', + 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok', + 'uploader': 'tatemcrae', + 'uploader_id': '86328792343818240', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'creator': 't8', + 'artist': 't8', + 'track': 'original sound', + 'upload_date': '20220609', + 'timestamp': 1654805899, + 'duration': 150, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + 'params': {'format': 'bytevc1_1080p_808907-0'}, }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', From ee0ed0338df328cd986f97315c8162b5a151476d Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Mon, 5 Jun 2023 10:40:48 -0500 Subject: [PATCH 166/501] [extractor/zdf] Fix formats extraction Closes #7238, Closes #7240 Authored by: bashonly --- yt_dlp/extractor/zdf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index c863c46ed7..c04d51b7ea 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -24,7 +24,7 @@ class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'uhd') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') def _call_api(self, url, video_id, item, api_token=None, referrer=None): headers = {} @@ -61,6 +61,9 @@ def _extract_format(self, video_id, formats, format_urls, meta): elif mime_type == 'application/f4m+xml' or ext == 'f4m': new_formats = self._extract_f4m_formats( update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) + elif ext == 'mpd': + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) else: f = 
parse_codecs(meta.get('mimeCodec')) if not f and meta.get('type'): From 59d9fe08312bbb76ee26238d207a8ca35410a48d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 5 Jun 2023 10:52:45 -0500 Subject: [PATCH 167/501] [extractor/mgtv] Fix formats extraction (#7234) Closes #7008 Authored by: bashonly --- yt_dlp/extractor/mgtv.py | 65 ++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index edc92b371f..06edcb396a 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -1,17 +1,17 @@ import base64 import time +import urllib.error import uuid from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) from ..utils import ( ExtractorError, int_or_none, + parse_resolution, + traverse_obj, try_get, url_or_none, + urljoin, ) @@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15588271.html', 'info_dict': { 'id': '15588271', 'ext': 'mp4', - 'title': '春日迟迟再出发 沉浸版', + 'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫', 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', 'thumbnail': r're:^https?://.+\.jpg', 'duration': 4026, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/333652/7329822.html', 'info_dict': { @@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'duration': 2656, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15591647.html', 'only_matching': True, @@ -64,6 +67,13 @@ class MGTVIE(InfoExtractor): 'only_matching': True, }] + _RESOLUTIONS = { + '标清': ('480p', '854x480'), + '高清': ('540p', '960x540'), + '超清': ('720p', '1280x720'), + '蓝光': ('1080p', '1920x1080'), + } + def _real_extract(self, url): video_id = self._match_id(url) tk2 = base64.urlsafe_b64encode( @@ -76,55 +86,60 @@ def _real_extract(self, url): 'type': 'pch5' }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: error = self._parse_json(e.cause.read().decode(), None) if error.get('code') == 40005: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(error['msg'], expected=True) raise - info = api_data['info'] - title = info['title'].strip() + stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ - 'pm2': api_data['atc']['pm2'], 'tk2': tk2, + 'pm2': api_data['atc']['pm2'], 'video_id': video_id, + 'type': 'pch5', 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] - stream_domain = stream_data['stream_domain'][0] + stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False) formats = [] - for idx, stream in enumerate(stream_data['stream']): - stream_path = stream.get('url') - if not stream_path: - continue - format_data = self._download_json( - stream_domain + stream_path, video_id, - note=f'Download video info for format #{idx}') - format_url = format_data.get('info') + for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))): + stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str) + resolution = traverse_obj( + self._RESOLUTIONS, (stream_name, 
1 if stream.get('scale') == '16:9' else 0)) + format_url = traverse_obj(self._download_json( + urljoin(stream_domain, stream['url']), video_id, fatal=False, + note=f'Downloading video info for format {resolution or stream_name}'), + ('info', {url_or_none})) if not format_url: continue tbr = int_or_none(stream.get('filebitrate') or self._search_regex( r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ - 'format_id': compat_str(tbr or idx), - 'url': url_or_none(format_url), + 'format_id': str(tbr or idx), + 'url': format_url, 'ext': 'mp4', 'tbr': tbr, + 'vcodec': stream.get('videoFormat'), + 'acodec': stream.get('audioFormat'), + **parse_resolution(resolution), 'protocol': 'm3u8_native', 'http_headers': { 'Referer': url, }, - 'format_note': stream.get('name'), + 'format_note': stream_name, }) return { 'id': video_id, - 'title': title, 'formats': formats, - 'description': info.get('desc'), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('thumb'), + **traverse_obj(api_data, ('info', { + 'title': ('title', {str.strip}), + 'description': ('desc', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumb', {url_or_none}), + })), 'subtitles': self.extract_subtitles(video_id, stream_domain), } From c2b801fea59628d5c873e06a0727fbf2051bbd1f Mon Sep 17 00:00:00 2001 From: stanoarn <74262064+stanoarn@users.noreply.github.com> Date: Wed, 7 Jun 2023 22:18:06 +0200 Subject: [PATCH 168/501] [extractor/rozhlas] `MujRozhlas`: Add extractor (#7129) Authored by: stanoarn --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rozhlas.py | 164 ++++++++++++++++++++++++++++---- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7120fd37d1..f54024211e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1625,6 +1625,7 @@ from .rozhlas import ( RozhlasIE, RozhlasVltavaIE, + MujRozhlasIE, ) from .rte import RteIE, RteRadioIE from .rtlnl import ( diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py index 5cc664e00b..5f83d42e83 100644 --- a/yt_dlp/extractor/rozhlas.py +++ b/yt_dlp/extractor/rozhlas.py @@ -1,10 +1,15 @@ +import itertools +import urllib.error + from .common import InfoExtractor from ..utils import ( + ExtractorError, extract_attributes, int_or_none, remove_start, str_or_none, traverse_obj, + unified_timestamp, url_or_none, ) @@ -51,7 +56,40 @@ def _real_extract(self, url): } -class RozhlasVltavaIE(InfoExtractor): +class RozhlasBaseIE(InfoExtractor): + def _extract_formats(self, entry, audio_id): + formats = [] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, audio_id) + try: + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 429: + retry.error = e.cause + else: + self.report_warning(e.msg) + + return formats + + +class RozhlasVltavaIE(RozhlasBaseIE): _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)' _TESTS = [{ 
'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', @@ -168,33 +206,14 @@ class RozhlasVltavaIE(InfoExtractor): }] def _extract_video(self, entry): - formats = [] audio_id = entry['meta']['ga']['contentId'] - for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): - ext = audio.get('variant') - if ext == 'dash': - formats.extend(self._extract_mpd_formats( - audio['url'], audio_id, mpd_id=ext, fatal=False)) - elif ext == 'hls': - formats.extend(self._extract_m3u8_formats( - audio['url'], audio_id, 'm4a', m3u8_id=ext, fatal=False)) - else: - formats.append({ - 'url': audio['url'], - 'ext': ext, - 'format_id': ext, - 'abr': int_or_none(audio.get('bitrate')), - 'acodec': ext, - 'vcodec': 'none', - }) - chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) return { 'id': audio_id, 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None, 'chapter_number': chapter_number, - 'formats': formats, + 'formats': self._extract_formats(entry, audio_id), **traverse_obj(entry, { 'title': ('meta', 'ga', 'contentName'), 'description': 'title', @@ -219,3 +238,106 @@ def _real_extract(self, url): 'title': traverse_obj(data, ('series', 'title')), 'entries': map(self._extract_video, data['playlist']), } + + +class MujRozhlasIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # single episode extraction + 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', + 'md5': '6f8fd68663e64936623e67c152a669e0', + 'info_dict': { + 'id': '10739193', + 'ext': 'mp3', + 'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli', + 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', + 'timestamp': 1684915200, + 'modified_timestamp': 1684922446, + 'series': 'Vykopávky', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', + 'channel_id': 'radio-wave', + 'upload_date': '20230524', + 'modified_date': '20230524', + }, + }, { + # serial extraction + 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b', + 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. 
O pramenech a genezi Babičky', + 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be', + }, + }, { + # show extraction + 'url': 'https://www.mujrozhlas.cz/nespavci', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '09db9b37-d0f4-368c-986a-d3439f741f08', + 'title': 'Nespavci', + 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', + }, + }] + + def _call_api(self, path, item_id, msg='API JSON'): + return self._download_json( + f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id, + note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data'] + + def _extract_audio_entry(self, entry): + audio_id = entry['meta']['ga']['contentId'] + + return { + 'id': audio_id, + 'formats': self._extract_formats(entry['attributes'], audio_id), + **traverse_obj(entry, { + 'title': ('attributes', 'title'), + 'description': ('attributes', 'description'), + 'episode_number': ('attributes', 'part'), + 'series': ('attributes', 'mirroredShow', 'title'), + 'chapter': ('attributes', 'mirroredSerial', 'title'), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + 'timestamp': ('attributes', 'since', {unified_timestamp}), + 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}), + 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}), + }) + } + + def _entries(self, api_url, playlist_id): + for page in itertools.count(1): + episodes = self._download_json( + api_url, playlist_id, note=f'Downloading episodes page {page}', + errnote=f'Failed to download episodes page {page}', fatal=False) + for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])): + yield self._extract_audio_entry(episode) + api_url = traverse_obj(episodes, ('links', 'next', {url_or_none})) + if not api_url: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id) + + entity = info['siteEntityBundle'] + + if entity == 'episode': + return self._extract_audio_entry(self._call_api( + 'episodes', info['contentId'], 'episode info API JSON')) + + elif entity in ('show', 'serial'): + playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId'] + data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON') + api_url = data['relationships']['episodes']['links']['related'] + return self.playlist_result( + self._entries(api_url, playlist_id), playlist_id, + **traverse_obj(data, ('attributes', { + 'title': 'title', + 'description': 'description', + }))) + + else: + # `entity == 'person'` not implemented yet by API, ref: + # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation + raise ExtractorError(f'Unsupported entity type "{entity}"') From 14a14335b280766fbf5a469ae26836d6c1fe450a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 8 Jun 2023 18:58:49 +0530 Subject: [PATCH 169/501] [extractor/youtube] Misc cleanup Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 93 ++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6e7485c030..1b12663603 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -292,6 +292,7 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() 
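# Aside, not part of the patch: the _extract_badges() refactor that follows
# funnels three different renderer hints (icon type, badge style, and the
# textual label) into a single BadgeType. A minimal self-contained sketch of
# that classification idea — the *_sketch names are hypothetical, while the
# map keys mirror values that appear in the diff itself:
import enum

class _BadgeTypeSketch(enum.Enum):
    VERIFIED = enum.auto()
    LIVE_NOW = enum.auto()

def _classify_badge_sketch(badge):
    icon_map = {'CHECK_CIRCLE_THICK': _BadgeTypeSketch.VERIFIED,
                'OFFICIAL_ARTIST_BADGE': _BadgeTypeSketch.VERIFIED}
    style_map = {'BADGE_STYLE_TYPE_LIVE_NOW': _BadgeTypeSketch.LIVE_NOW}
    label_map = {'verified': _BadgeTypeSketch.VERIFIED, 'live': _BadgeTypeSketch.LIVE_NOW}
    badge_type = (icon_map.get((badge.get('icon') or {}).get('iconType'))
                  or style_map.get(badge.get('style')))
    if badge_type:
        return badge_type
    label = (badge.get('label') or '').lower()  # fallback; unreliable for non-English labels
    return next((t for text, t in label_map.items() if text in label), None)

# e.g. _classify_badge_sketch({'style': 'BADGE_STYLE_TYPE_LIVE_NOW'}) -> _BadgeTypeSketch.LIVE_NOW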
class YoutubeBaseInfoExtractor(InfoExtractor): @@ -791,17 +792,23 @@ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. + @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, } label_map = { @@ -809,13 +816,13 @@ def _extract_badges(self, renderer: dict): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer')): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -823,11 +830,12 @@ def _extract_badges(self, renderer: dict): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -1020,8 +1028,7 @@ def _extract_video(self, renderer): overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) - badges = self._extract_badges(renderer) - + badges = self._extract_badges(traverse_obj(renderer, 'badges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1079,7 +1086,7 @@ def _extract_video(self, renderer): needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, - 'live_status': live_status + 'live_status': live_status, } @@ -1332,6 +1339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1415,6 +1423,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_url': 'https://www.youtube.com/@thewitcher', 
'uploader_id': '@thewitcher', + 'comment_count': int, + 'heatmap': 'count:100', }, }, { @@ -1894,6 +1904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -1955,6 +1966,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Vsauce', 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', + 'comment_count': int, }, 'params': { 'skip_download': True, @@ -2147,6 +2159,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -2227,6 +2240,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'CBS Mornings', 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', + 'comment_count': int, } }, { @@ -2297,6 +2311,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'colinfurze', 'uploader_url': 'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'format': '17', # 3gp format available on android @@ -2342,6 +2358,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'SciShow', 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2370,6 +2388,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', } }, { # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date @@ -2398,6 +2417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Leon Nguyen', 'uploader_url': 'https://www.youtube.com/@LeonNguyen', 'uploader_id': '@LeonNguyen', + 'heatmap': 'count:100', }, 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { @@ -2428,6 +2448,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Quackity', 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', + 'comment_count': int, + 'heatmap': 'count:100', } }, { # continuous livestream. Microformat upload date should be preferred. 
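# Aside, illustrative only — not part of the patch: the expected values being
# added across these test hunks ('comment_count': int, 'heatmap': 'count:100')
# are matchers, not literal field values. A simplified sketch of the
# comparison rules they rely on; the real checker lives in yt-dlp's test
# helpers, so treat this standalone version as an assumption about its
# behaviour rather than its source:
import hashlib
import re

def _expect_value_sketch(expected, got):
    if isinstance(expected, type):                          # 'comment_count': int
        return isinstance(got, expected)
    if isinstance(expected, str) and expected.startswith('count:'):
        return len(got) == int(expected[len('count:'):])    # 'heatmap': 'count:100'
    if isinstance(expected, str) and expected.startswith('md5:'):
        return hashlib.md5(str(got).encode()).hexdigest() == expected[len('md5:'):]
    if isinstance(expected, str) and expected.startswith('re:'):
        return re.match(expected[len('re:'):], str(got)) is not None
    return expected == got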
@@ -2594,6 +2616,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'MrBeast', 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -2655,6 +2679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'heatmap': 'count:100', }, }, { @@ -2684,6 +2709,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.webp', 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, + 'comment_count': int, + 'heatmap': 'count:100', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, @@ -2720,6 +2747,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Christopher Sykes', 'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries', 'uploader_id': '@ChristopherSykesDocumentaries', + 'heatmap': 'count:100', }, 'params': { 'skip_download': True, @@ -3312,10 +3340,9 @@ def _extract_comment(self, comment_renderer, parent=None): if comment_abr is not None: info['is_favorited'] = 'creatorHeart' in comment_abr - comment_ab_icontype = traverse_obj( - comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType')) - if comment_ab_icontype is not None: - info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE') + badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) + if self._has_badge(badges, BadgeType.VERIFIED): + info['author_is_verified'] = True is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') if is_pinned: @@ -4481,7 +4508,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + badges = self._extract_badges(traverse_obj(vpir, 'badges')) is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or get_first(video_details, 'isPrivate', expected_type=bool)) @@ -4554,13 +4581,14 @@ def _extract_channel_renderer(self, renderer): channel_id = self.ucid_or_none(renderer['channelId']) title = self._get_text(renderer, 'title') channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) - # As of 2023-03-01 YouTube doesn't use the channel handles on these renderers yet. - # However we can expect them to change that in the future. channel_handle = self.handle_from_url( traverse_obj(renderer, ( 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')), {str}), get_all=False)) + if not channel_handle: + # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search + channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) return { '_type': 'url', 'url': channel_url, @@ -4573,9 +4601,15 @@ def _extract_channel_renderer(self, renderer): 'title': title, 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - 'channel_follower_count': self._get_count(renderer, 'subscriberCountText'), + # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. 
+ # However, in feed/channels this is set correctly to the subscriber count + 'channel_follower_count': traverse_obj( + renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), - 'playlist_count': self._get_count(renderer, 'videoCountText'), + 'playlist_count': ( + # videoCountText may be the subscriber count + self._get_count(renderer, 'videoCountText') + if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), } @@ -5100,7 +5134,7 @@ def _extract_availability(self, data): playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} player_header_privacy = playlist_header_renderer.get('privacy') - badges = self._extract_badges(sidebar_renderer) + badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge privacy_setting_icon = get_first( @@ -5350,7 +5384,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], - 'channel_follower_count': int + 'channel_follower_count': int, }, }, { 'note': 'playlists, singlepage', @@ -5690,7 +5724,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'AlTsmyW4auo', # This will keep changing + 'id': 'hGkQjiJLjWQ', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -6202,7 +6236,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': str, 'uploader': str, 'uploader_url': str, - 'uploader_id': str + 'uploader_id': str, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6865,12 +6899,14 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', 'title': 'Kurzgesagt – In a Nutshell', 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', - 'playlist_count': int, # XXX: should have a way of saying > 1 + # No longer available for search as it is set to the handle. 
+ # 'playlist_count': int, 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', 'thumbnails': list, 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_follower_count': int, } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -7134,6 +7170,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'live_status': 'not_live', 'channel_follower_count': int, 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', } }] @@ -7194,6 +7232,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], 'params': {'skip_download': 'Youtube'}, From 8213ce28a485e200f6a7e1af1434a987c8e702bd Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Thu, 8 Jun 2023 19:50:05 +1200 Subject: [PATCH 170/501] [extractor/youtube] Extract `channel_is_verified` (#7213) Authored by: coletdjnz --- README.md | 1 + yt_dlp/extractor/common.py | 1 + yt_dlp/extractor/youtube.py | 38 +++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/README.md b/README.md index 3d89c0af94..ce555c66f0 100644 --- a/README.md +++ b/README.md @@ -1292,6 +1292,7 @@ # OUTPUT TEMPLATE - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel - `channel_follower_count` (numeric): Number of followers of the channel + - `channel_is_verified` (boolean): Whether the channel is verified on the platform - `location` (string): Physical location where the video was filmed - `duration` (numeric): Length of the video in seconds - `duration_string` (string): Length of the video (HH:mm:ss) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index fa46a5240f..ca2164a5db 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -286,6 +286,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. 
"tag" is usually a language code, and diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1b12663603..47ad1da76c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -803,12 +803,15 @@ def _extract_badges(self, badge_list: list): 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED } label_map = { @@ -817,6 +820,8 @@ def _extract_badges(self, badge_list: list): 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED } badges = [] @@ -1029,6 +1034,7 @@ def _extract_video(self, renderer): renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) badges = self._extract_badges(traverse_obj(renderer, 'badges')) + owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -1087,6 +1093,7 @@ def _extract_video(self, renderer): is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, 'live_status': live_status, + 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None } @@ -1424,6 +1431,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@thewitcher', 'uploader_id': '@thewitcher', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, }, @@ -1454,6 +1462,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@FlyingKitty900', 'uploader_id': '@FlyingKitty900', 'comment_count': int, + 'channel_is_verified': True, }, }, { @@ -1587,6 +1596,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Olympics', 'uploader_url': 'https://www.youtube.com/@Olympics', 'uploader_id': '@Olympics', + 'channel_is_verified': True, }, 'params': { 'skip_download': 'requires avconv', @@ -1904,6 +1914,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Bernie Sanders', 'uploader_url': 'https://www.youtube.com/@BernieSanders', 'uploader_id': '@BernieSanders', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -1967,6 +1978,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@Vsauce', 'uploader_id': '@Vsauce', 'comment_count': int, + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -2159,6 +2171,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'kudvenkat', 'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot', 'uploader_id': '@Csharp-video-tutorialsBlogspot', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -2241,6 +2254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@CBSMornings', 'uploader_id': '@CBSMornings', 'comment_count': int, + 'channel_is_verified': True, } }, { @@ -2312,6 +2326,7 @@ 
class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@colinfurze', 'uploader_id': '@colinfurze', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -2359,6 +2374,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@SciShow', 'uploader_id': '@SciShow', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': {'format': 'mhtml', 'skip_download': True} }, { @@ -2449,6 +2465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@Quackity', 'uploader_url': 'https://www.youtube.com/@Quackity', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', } }, @@ -2617,6 +2634,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@MrBeast', 'uploader_id': '@MrBeast', 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, @@ -2679,6 +2697,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'さなちゃんねる', 'uploader_url': 'https://www.youtube.com/@sana_natori', 'uploader_id': '@sana_natori', + 'channel_is_verified': True, 'heatmap': 'count:100', }, }, @@ -2710,6 +2729,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A', 'playable_in_embed': True, 'comment_count': int, + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'params': { @@ -4483,6 +4503,9 @@ def process_language(container, base_url, lang_code, sub_name, query): info['artist'] = mrr_contents_text elif mrr_title == 'Song': info['track'] = mrr_contents_text + owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges'))) + if self._has_badge(owner_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True info.update({ 'uploader': info.get('channel'), @@ -4611,6 +4634,8 @@ def _extract_channel_renderer(self, renderer): self._get_count(renderer, 'videoCountText') if self._get_count(renderer, 'subscriberCountText') is not None else None), 'description': self._get_text(renderer, 'descriptionSnippet'), + 'channel_is_verified': True if self._has_badge( + self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None, } def _grid_entries(self, grid_renderer): @@ -5026,6 +5051,10 @@ def _get_uncropped(url): 'uploader_id': channel_handle, 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), }) + + channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False)) + if self._has_badge(channel_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True # Playlist stats is a text runs array containing [video count, view count, last updated]. # last updated or (view count and last updated) may be missing. 
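# Aside, a minimal sketch — not part of this patch — of the alignment problem
# the two comments above describe. Because only trailing entries can be
# absent, pairing keys positionally from the front is safe, and zip() simply
# drops the keys that have no run (key names here are illustrative; the real
# extraction differs):
#   ['100 videos', '1M views', 'Updated today'] -> all three fields present
#   ['100 videos', '1M views']                  -> last updated missing
#   ['100 videos']                              -> view count and last updated missing
def _playlist_stats_sketch(stats_runs):
    keys = ('video_count', 'view_count', 'last_updated')
    return dict(zip(keys, stats_runs))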
playlist_stats = get_first( @@ -5385,6 +5414,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], 'channel_follower_count': int, + 'channel_is_verified': True, }, }, { 'note': 'playlists, singlepage', @@ -5561,6 +5591,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', @@ -5748,6 +5779,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@SkyNews', 'uploader_id': '@SkyNews', 'uploader': 'Sky News', + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -6237,6 +6269,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': str, 'uploader_url': str, 'uploader_id': str, + 'channel_is_verified': bool, # this will keep changing } }], 'params': {'extract_flat': True, 'playlist_items': '1'}, @@ -6272,6 +6305,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'PewDiePie', 'uploader_url': 'https://www.youtube.com/@PewDiePie', 'uploader_id': '@PewDiePie', + 'channel_is_verified': True, } }], 'params': {'extract_flat': True}, @@ -6290,6 +6324,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, 'playlist_count': 0, }, { @@ -6324,6 +6359,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'description': 'I make music', 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', 'channel_follower_count': int, + 'channel_is_verified': True, }, 'playlist_mincount': 10, }] @@ -6906,6 +6942,7 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@kurzgesagt', 'uploader_url': 'https://www.youtube.com/@kurzgesagt', 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_is_verified': True, 'channel_follower_count': int, } }], @@ -7232,6 +7269,7 @@ class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): 'channel': 'さなちゃんねる', 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', 'uploader': 'さなちゃんねる', + 'channel_is_verified': True, 'heatmap': 'count:100', }, 'add_ie': ['Youtube'], From 44c0d66442b568d9e1359e669d8b029b08a77fa7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:36:09 -0500 Subject: [PATCH 171/501] [extractor/lbry] Extract original quality formats (#7257) Closes #7251 Authored by: bashonly --- yt_dlp/extractor/lbry.py | 129 ++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index b5def1e071..23d3daf13e 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -1,8 +1,8 @@ import functools import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, @@ -12,7 +12,10 @@ int_or_none, mimetype2ext, parse_qs, + traverse_obj, try_get, + url_or_none, + urlhandle_detect_ext, urljoin, ) @@ -52,38 +55,25 @@ def _permanent_url(self, url, claim_name, claim_id): '/%s:%s' % (claim_name, claim_id)) def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = 
stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) + stream_type = traverse_obj(stream, ('value', 'stream_type', {str})) + + info = traverse_obj(stream, { + 'title': ('value', 'title', {str}), + 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}), + 'description': ('value', 'description', {str}), + 'license': ('value', 'license', {str}), + 'timestamp': ('timestamp', {int_or_none}), + 'release_timestamp': ('value', 'release_time', {int_or_none}), + 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'duration': ('value', stream_type, 'duration', {int_or_none}), + 'channel': ('signing_channel', 'value', 'title', {str}), + 'channel_id': ('signing_channel', 'claim_id', {str}), + }) + + channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) + if channel_name and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) return info @@ -186,6 +176,28 @@ class LBRYIE(LBRYBaseIE): 'license': 'None', }, 'params': {'skip_download': True} + }, { + # original quality format w/higher resolution than HLS formats + 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4', + 'md5': '305b0b3b369bde1b984961f005b67193', + 'info_dict': { + 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', + 'ext': 'mp4', + 'title': 'Biotechnological Invasion of Skin (April 2023)', + 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'channel': 'Wicked Truths', + 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'timestamp': 1685790036, + 'upload_date': '20230603', + 'release_timestamp': 1685617473, + 'release_date': '20230601', + 'duration': 1063, + 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp', + 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'], + 'license': 'None', + 'protocol': 'https', # test for direct mp4 download + }, }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -221,41 +233,64 @@ def _real_extract(self, url): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) + display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = 
self._resolve_url(uri, display_id, 'stream') headers = {'Referer': 'https://odysee.com/'} - if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + + formats = [] + stream_type = traverse_obj(result, ('value', 'stream_type', {str})) + + if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + + # GET request returns original video/audio file if available + ext = urlhandle_detect_ext(self._request_webpage( + streaming_url, display_id, 'Checking for original quality', headers=headers)) + if ext != 'm3u8': + formats.append({ + 'url': streaming_url, + 'format_id': 'original', + 'quality': 1, + **traverse_obj(result, ('value', { + 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))), + 'filesize': ('source', 'size', {int_or_none}), + 'width': ('video', 'width', {int_or_none}), + 'height': ('video', 'height', {int_or_none}), + }), get_all=False), + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + # HEAD request returns redirect response to m3u8 URL if available final_url = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, note='Downloading streaming redirect url info').geturl() + elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('VideoURL') + final_url = live_data.get('VideoURL') # Upcoming videos may still give VideoURL if not live_data.get('Live'): - streaming_url = final_url = None + final_url = None self.raise_no_formats('This stream is not live', True, claim_id) + else: raise UnsupportedError(url) - info = self._parse_stream(result, url) if determine_ext(final_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - else: - info['url'] = streaming_url + formats.extend(self._extract_m3u8_formats( + final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers)) + return { - **info, + **self._parse_stream(result, url), 'id': claim_id, - 'title': result['value']['title'], + 'formats': formats, 'is_live': is_live, 'http_headers': headers, } @@ -299,14 +334,12 @@ def _fetch_page(self, claim_id, url, params, page): if not (stream_claim_name and stream_claim_id): continue - info = self._parse_stream(item, url) - info.update({ + yield { + **self._parse_stream(item, url), '_type': 'url', 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - yield info + } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') From d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 8 Jun 2023 13:47:13 -0500 Subject: [PATCH 172/501] [extractor/twitter] Add login support (#7258) Closes #6951 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 213 +++++++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4624ce5035..f854d9c4a4 100644 --- a/yt_dlp/extractor/twitter.py +++ 
b/yt_dlp/extractor/twitter.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE -from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -30,11 +29,67 @@ class TwitterBaseIE(InfoExtractor): + _NETRC_MACHINE = 'twitter' _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} _guest_token = None + _flow_token = None + + _LOGIN_INIT_DATA = json.dumps({ + 'input_flow_data': { + 'flow_context': { + 'debug_overrides': {}, + 'start_location': { + 'location': 'unknown' + } + } + }, + 'subtask_versions': { + 'action_list': 2, + 'alert_dialog': 1, + 'app_download_cta': 1, + 'check_logged_in_account': 1, + 'choice_selection': 3, + 'contacts_live_sync_permission_prompt': 0, + 'cta': 7, + 'email_verification': 2, + 'end_flow': 1, + 'enter_date': 1, + 'enter_email': 2, + 'enter_password': 5, + 'enter_phone': 2, + 'enter_recaptcha': 1, + 'enter_text': 5, + 'enter_username': 2, + 'generic_urt': 3, + 'in_app_notification': 1, + 'interest_picker': 3, + 'js_instrumentation': 1, + 'menu_dialog': 1, + 'notifications_permission_prompt': 2, + 'open_account': 2, + 'open_home_timeline': 1, + 'open_link': 1, + 'phone_verification': 4, + 'privacy_options': 1, + 'security_key': 3, + 'select_avatar': 4, + 'select_banner': 2, + 'settings_list': 7, + 'show_code': 1, + 'sign_up': 2, + 'sign_up_review': 4, + 'tweet_selection_urt': 1, + 'update_users': 1, + 'upload_media': 1, + 'user_recommendations_list': 4, + 'user_recommendations_urt': 1, + 'wait_spinner': 3, + 'web_modal': 1 + } + }, separators=(',', ':')).encode() def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -86,18 +141,151 @@ def _search_dimensions_in_video_url(a_format, video_url): 'height': int(m.group('height')), }) - @functools.cached_property + @property def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _call_api(self, path, video_id, query={}, graphql=False): - cookies = self._get_cookies(self._API_BASE) + def _fetch_guest_token(self, headers, display_id): + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if not self._guest_token: + raise ExtractorError('Could not retrieve guest token') + + def _set_base_headers(self): headers = self._AUTH.copy() + csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) + if csrf_token: + headers['x-csrf-token'] = csrf_token + return headers - csrf_cookie = cookies.get('ct0') - if csrf_cookie: - headers['x-csrf-token'] = csrf_cookie.value + def _call_login_api(self, note, headers, query={}, data=None): + response = self._download_json( + f'{self._API_BASE}onboarding/task.json', None, note, + headers=headers, query=query, data=data, expected_status=400) + error = traverse_obj(response, ('errors', 0, 'message', {str})) + if error: + raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True) + elif traverse_obj(response, 'status') != 'success': + raise ExtractorError('Login was unsuccessful') + subtask = 
traverse_obj( + response, ('subtasks', ..., 'subtask_id', {str}), get_all=False) + if not subtask: + raise ExtractorError('Twitter API did not return next login subtask') + + self._flow_token = response['flow_token'] + + return subtask + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage('https://twitter.com/', None, 'Requesting cookies') + headers = self._set_base_headers() + self._fetch_guest_token(headers, None) + headers.update({ + 'content-type': 'application/json', + 'x-guest-token': self._guest_token, + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + 'Referer': 'https://twitter.com/', + 'Origin': 'https://twitter.com', + }) + + def build_login_json(*subtask_inputs): + return json.dumps({ + 'flow_token': self._flow_token, + 'subtask_inputs': subtask_inputs + }, separators=(',', ':')).encode() + + def input_dict(subtask_id, text): + return { + 'subtask_id': subtask_id, + 'enter_text': { + 'text': text, + 'link': 'next_link' + } + } + + next_subtask = self._call_login_api( + 'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA) + + while not self.is_logged_in: + if next_subtask == 'LoginJsInstrumentationSubtask': + next_subtask = self._call_login_api( + 'Submitting JS instrumentation response', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'js_instrumentation': { + 'response': '{}', + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterUserIdentifierSSO': + next_subtask = self._call_login_api( + 'Submitting username', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'settings_list': { + 'setting_responses': [{ + 'key': 'user_identifier', + 'response_data': { + 'text_data': { + 'result': username + } + } + }], + 'link': 'next_link' + } + })) + + elif next_subtask == 'LoginEnterAlternateIdentifierSubtask': + next_subtask = self._call_login_api( + 'Submitting alternate identifier', headers, + data=build_login_json(input_dict(next_subtask, self._get_tfa_info( + 'one of username, phone number or email that was not used as --username')))) + + elif next_subtask == 'LoginEnterPassword': + next_subtask = self._call_login_api( + 'Submitting password', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'enter_password': { + 'password': password, + 'link': 'next_link' + } + })) + + elif next_subtask == 'AccountDuplicationCheck': + next_subtask = self._call_login_api( + 'Submitting account duplication check', headers, data=build_login_json({ + 'subtask_id': next_subtask, + 'check_logged_in_account': { + 'link': 'AccountDuplicationCheck_false' + } + })) + + elif next_subtask == 'LoginTwoFactorAuthChallenge': + next_subtask = self._call_login_api( + 'Submitting 2FA token', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('two-factor authentication token')))) + + elif next_subtask == 'LoginAcid': + next_subtask = self._call_login_api( + 'Submitting confirmation code', headers, data=build_login_json(input_dict( + next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + + elif next_subtask == 'LoginSuccessSubtask': + raise ExtractorError('Twitter API did not grant auth token cookie') + + else: + raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"') + + self.report_login() + + def _call_api(self, path, video_id, query={}, graphql=False): + headers = self._set_base_headers() if self.is_logged_in: headers.update({ 'x-twitter-auth-type': 'OAuth2Session', @@ 
-106,15 +294,10 @@ def _call_api(self, path, video_id, query={}, graphql=False): }) for first_attempt in (True, False): - if not self.is_logged_in and not self._guest_token: - headers.pop('x-guest-token', None) - self._guest_token = traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', video_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if self._guest_token: + if not self.is_logged_in: + if not self._guest_token: + self._fetch_guest_token(headers, video_id) headers['x-guest-token'] = self._guest_token - elif not self.is_logged_in: - raise ExtractorError('Could not retrieve guest token') allowed_status = {400, 401, 403, 404} if graphql else {403} result = self._download_json( From 4f7b11cc1c1cebf598107e00cd7295588ed484da Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 Jun 2023 15:43:22 -0500 Subject: [PATCH 173/501] [extractor/voot] Fix extractor (#7227) Closes #6715 Authored by: bashonly --- yt_dlp/extractor/voot.py | 177 ++++++++++++++++++++++++++------------- 1 file changed, 119 insertions(+), 58 deletions(-) diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py index b709b74e28..dd41647aa9 100644 --- a/yt_dlp/extractor/voot.py +++ b/yt_dlp/extractor/voot.py @@ -1,14 +1,86 @@ +import json +import time +import urllib.error +import uuid + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + float_or_none, int_or_none, + jwt_decode_hs256, + parse_age_limit, + traverse_obj, + try_call, try_get, - unified_timestamp, + unified_strdate, ) -class VootIE(InfoExtractor): +class VootBaseIE(InfoExtractor): + _NETRC_MACHINE = 'voot' + _GEO_BYPASS = False + _LOGIN_HINT = 'Log in with "-u <email_address> -p <password>", or use "-u token -p <auth_token>" to login with auth token.' 
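# Aside, not part of the patch: the '-u token -p <auth_token>' login path
# introduced here only needs the JWT's `exp` claim, which can be read without
# any signing secret. A standalone sketch of that decode — yt-dlp ships its
# own jwt_decode_hs256() helper, so this hand-rolled version is illustrative
# only:
import base64
import json
import time

def _jwt_seconds_to_expiry_sketch(token):
    payload = token.split('.')[1]            # header.payload.signature
    payload += '=' * (-len(payload) % 4)     # restore the base64 padding JWTs strip
    claims = json.loads(base64.urlsafe_b64decode(payload))
    return claims['exp'] - int(time.time())  # negative once the token is stale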
+ _TOKEN = None + _EXPIRY = 0 + _API_HEADERS = {'Origin': 'https://www.voot.com', 'Referer': 'https://www.voot.com/'} + + def _perform_login(self, username, password): + if self._TOKEN and self._EXPIRY: + return + + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): + VootBaseIE._TOKEN = password + VootBaseIE._EXPIRY = jwt_decode_hs256(password)['exp'] + self.report_login() + + # Mobile number as username is not supported + elif not username.isdigit(): + check_username = self._download_json( + 'https://userauth.voot.com/usersV3/v3/checkUser', None, data=json.dumps({ + 'type': 'email', + 'email': username + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Checking username', expected_status=403) + if not traverse_obj(check_username, ('isExist', {bool})): + if traverse_obj(check_username, ('status', 'code', {int})) == 9999: + self.raise_geo_restricted(countries=['IN']) + raise ExtractorError('Incorrect username', expected=True) + auth_token = traverse_obj(self._download_json( + 'https://userauth.voot.com/usersV3/v3/login', None, data=json.dumps({ + 'type': 'traditional', + 'deviceId': str(uuid.uuid4()), + 'deviceBrand': 'PC/MAC', + 'data': { + 'email': username, + 'password': password + } + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Logging in', expected_status=400), ('data', 'authToken', {dict})) + if not auth_token: + raise ExtractorError('Incorrect password', expected=True) + VootBaseIE._TOKEN = auth_token['accessToken'] + VootBaseIE._EXPIRY = auth_token['expirationTime'] + + else: + raise ExtractorError(self._LOGIN_HINT, expected=True) + + def _check_token_expiry(self): + if int(time.time()) >= self._EXPIRY: + raise ExtractorError('Access token has expired', expected=True) + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(self._LOGIN_HINT, method=None) + self._check_token_expiry() + + +class VootIE(VootBaseIE): _VALID_URL = r'''(?x) (?: voot:| @@ -20,27 +92,25 @@ class VootIE(InfoExtractor): ) (?P<id>\d{3,}) ''' - _GEO_COUNTRIES = ['IN'] _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '0_8ledb18o', + 'id': '441353', 'ext': 'mp4', - 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'title': 'Is this the end of Kamini?', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 'timestamp': 1472162937, + 'timestamp': 1472103000, 'upload_date': '20160825', 'series': 'Ishq Ka Rang Safed', 'season_number': 1, 'episode': 'Is this the end of Kamini?', 'episode_number': 340, - 'view_count': int, - 'like_count': int, + 'release_date': '20160825', + 'season': 'Season 1', + 'age_limit': 13, + 'duration': 1146.0, }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', 'only_matching': True, @@ -55,59 +125,50 @@ class VootIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) media_info = self._download_json( - 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, - query={ - 'platform': 'Web', - 'pId': 2, - 'mediaId': video_id, - }) + 'https://psapi.voot.com/jio/voot/v1/voot-web/content/query/asset-details', video_id, + query={'ids': 
f'include:{video_id}', 'responseType': 'common'}, headers={'accesstoken': self._TOKEN}) - status_code = try_get(media_info, lambda x: x['status']['code'], int) - if status_code != 0: - raise ExtractorError(media_info['status']['message'], expected=True) + try: + m3u8_url = self._download_json( + 'https://vootapi.media.jio.com/playback/v1/playbackrights', video_id, + 'Downloading playback JSON', data=b'{}', headers={ + **self.geo_verification_headers(), + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + 'platform': 'androidwebdesktop', + 'vootid': video_id, + 'voottoken': self._TOKEN, + })['m3u8'] + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 400: + self._check_token_expiry() + raise - media = media_info['assets'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._remove_duplicate_formats(formats) - entry_id = media['EntryId'] - title = media['MediaName'] - formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, - video_id, 'mp4', m3u8_id='hls') - - description, series, season_number, episode, episode_number = [None] * 5 - - for meta in try_get(media, lambda x: x['Metas'], list) or []: - key, value = meta.get('Key'), meta.get('Value') - if not key or not value: - continue - if key == 'ContentSynopsis': - description = value - elif key == 'RefSeriesTitle': - series = value - elif key == 'RefSeriesSeason': - season_number = int_or_none(value) - elif key == 'EpisodeMainTitle': - episode = value - elif key == 'EpisodeNo': - episode_number = int_or_none(value) return { - 'extractor_key': 'Kaltura', - 'id': entry_id, - 'title': title, - 'description': description, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'timestamp': unified_timestamp(media.get('CreationDate')), - 'duration': int_or_none(media.get('Duration')), - 'view_count': int_or_none(media.get('ViewCounter')), - 'like_count': int_or_none(media.get('like_counter')), - 'formats': formats, + 'id': video_id, + # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p + 'formats': traverse_obj(formats, ( + lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)), + 'http_headers': self._API_HEADERS, + **traverse_obj(media_info, ('result', 0, { + 'title': ('fullTitle', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('showName', {str}), + 'season_number': ('season', {int_or_none}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('uploadTime', {int_or_none}), + 'release_date': ('telecastDate', {unified_strdate}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('duration', {float_or_none}), + })), } -class VootSeriesIE(InfoExtractor): +class VootSeriesIE(VootBaseIE): _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})' _TESTS = [{ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', From b4a252fba81f53631c07ca40ce7583f5d19a8a36 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 Jun 2023 17:49:12 -0500 Subject: [PATCH 174/501] [jsinterp] Fix division (#7279) * Fixes nsig decryption for Youtube JS player `8c7583ff` Authored by: bashonly --- test/test_jsinterp.py | 7 +++++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 2 +- 3 files changed, 12 insertions(+), 
1 deletion(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4d44e6efe6..b01477e6ff 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -28,6 +28,13 @@ def test_basic(self): def test_calc(self): self._test('function f(a){return 2*a+1;}', 7, args=[3]) + def test_div(self): + jsi = JSInterpreter('function f(a, b){return a / b;}') + self.assertTrue(math.isnan(jsi.call_function('f', 0, 0))) + self.assertTrue(math.isnan(jsi.call_function('f', JS_Undefined, 1))) + self.assertTrue(math.isinf(jsi.call_function('f', 2, 0))) + self.assertEqual(jsi.call_function('f', 0, 3), 0) + def test_empty_return(self): self._test('function f(){return; y()}', None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 13120d97f8..01f09de88c 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -150,6 +150,10 @@ 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w', ), + ( + 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', + '1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 7c7940efd5..d6d555733a 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -44,7 +44,7 @@ def wrapped(a, b): def _js_div(a, b): - if JS_Undefined in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a or b): return float('nan') return (a or 0) / b if b else float('inf') From f8ae441501596733e2b967430471643a1d7cacb8 Mon Sep 17 00:00:00 2001 From: DataGhost <site.github@dataghost.com> Date: Sun, 11 Jun 2023 17:17:26 +0200 Subject: [PATCH 175/501] [extractor/Dumpert] Fix m3u8 and support new URL pattern (#6091) Authored by: DataGhost, pukkandan Closes #5032 --- yt_dlp/extractor/dumpert.py | 49 +++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) mode change 100644 => 100755 yt_dlp/extractor/dumpert.py diff --git a/yt_dlp/extractor/dumpert.py b/yt_dlp/extractor/dumpert.py old mode 100644 new mode 100755 index 010c2d092d..0cf84263c3 --- a/yt_dlp/extractor/dumpert.py +++ b/yt_dlp/extractor/dumpert.py @@ -1,12 +1,17 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, qualities, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P<id>[0-9]+[/_][0-9a-zA-Z]+)' + _VALID_URL = r'''(?x) + (?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: + /(?:mediabase|embed|item)/| + (?:/toppers|/latest|/?)\?selectedId= + )(?P<id>[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor): 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 9, + 'view_count': int, + 'like_count': int, } }, { 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', @@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor): }, { 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/item/100031688_b317a185', + 'info_dict': { + 'id': '100031688/b317a185', + 'ext': 'mp4', + 'title': 'Epic schijnbeweging', + 'description': '<p>Die zag je niet eh</p>', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'duration': 12, + 'view_count': int, + 'like_count': int, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 
'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,18 +66,23 @@ def _real_extract(self, url): title = item['title'] media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') - quality = qualities(['flv', 'mobile', 'tablet', '720p']) + quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p']) formats = [] for variant in media.get('variants', []): uri = variant.get('uri') if not uri: continue version = variant.get('version') - formats.append({ - 'url': uri, - 'format_id': version, - 'quality': quality(version), - }) + preference = quality(version) + if determine_ext(uri) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', m3u8_id=version, quality=preference)) + else: + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': preference, + }) thumbnails = [] stills = item.get('stills') or {} From 1a2eb5bda51d8b7a78a65acebf72a0dcf9da196b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 11 Jun 2023 12:06:34 -0500 Subject: [PATCH 176/501] [extractor/odnoklassniki] Fix formats extraction (#7217) Closes #2959, Closes #4462, Closes #7201 Authored by: bashonly --- yt_dlp/extractor/odnoklassniki.py | 56 ++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 4b73eed37e..0d0ad0bb86 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -1,3 +1,5 @@ +import urllib.parse + from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -7,6 +9,7 @@ ) from ..utils import ( ExtractorError, + HEADRequest, float_or_none, int_or_none, qualities, @@ -15,6 +18,7 @@ unescapeHTML, unified_strdate, unsmuggle_url, + url_or_none, urlencode_postdata, ) @@ -41,7 +45,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -65,13 +69,14 @@ class OdnoklassnikiIE(InfoExtractor): 'title': str, 'uploader': str, }, + 'skip': 'vk extractor error', }, { - # metadata in JSON + # metadata in JSON, webm_dash with Firefox UA 'url': 'http://ok.ru/video/20079905452', - 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', + 'md5': '8f477d8931c531374a3e36daec617b2c', 'info_dict': { 'id': '20079905452', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Культура меняет нас (прекрасный ролик!))', 'thumbnail': str, 'duration': 100, @@ -81,10 +86,14 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'params': { + 'format': 'bv[ext=webm]', + 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'}, + }, }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': 'f8c951122516af72e6e6ffdd3c41103b', + 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -98,10 +107,11 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': 0, 'start_time': 5, }, + 'params': 
{'skip_download': 'm3u8'}, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'https://ok.ru/video/3952212382174', - 'md5': '91749d0bd20763a28d083fa335bbd37a', + 'md5': '5fb5f83ce16cb212d6bf887282b5da53', 'info_dict': { 'id': '5axVgHHDBvU', 'ext': 'mp4', @@ -116,7 +126,7 @@ class OdnoklassnikiIE(InfoExtractor): 'live_status': 'not_live', 'view_count': int, 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', - 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'uploader_url': 'https://www.youtube.com/@MrKewlkid94', 'channel_follower_count': int, 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', @@ -145,7 +155,6 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { - # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { @@ -153,8 +162,8 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', }, - 'skip': 'HTTP Error 400', }, { 'note': 'subtitles', 'url': 'https://ok.ru/video/4249587550747', @@ -226,6 +235,14 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Site no longer embeds', }] + def _clear_cookies(self, cdn_url): + # Direct http downloads will fail if CDN cookies are set + # so we need to reset them after each format extraction + if self._get_cookies('https://notarealsubdomain.mycdn.me/'): + self.cookiejar.clear(domain='.mycdn.me') + if self._get_cookies(cdn_url): + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + @classmethod def _extract_embed_urls(cls, url, webpage): for x in super()._extract_embed_urls(url, webpage): @@ -364,14 +381,22 @@ def _extract_desktop(self, url): formats = [{ 'url': f['url'], 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] + 'format_id': f.get('name'), + } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))] - m3u8_url = metadata.get('hlsManifestUrl') + m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) + + for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]: + mpd_url = metadata.get(mpd_key) + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id=mpd_id, fatal=False)) + self._clear_cookies(mpd_url) dash_manifest = metadata.get('metadataEmbedded') if dash_manifest: @@ -390,6 +415,7 @@ def _extract_desktop(self, url): if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) rtmp_url = metadata.get('rtmpUrl') if rtmp_url: formats.append({ @@ -423,6 +449,10 @@ def _extract_mobile(self, url): r'data-video="(.+?)"', webpage, 'json data') json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + redirect_url = self._request_webpage(HEADRequest( + json_data['videoSrc']), video_id, 'Requesting download URL').geturl() + self._clear_cookies(redirect_url) + return { 'id': video_id, 'title': json_data.get('videoName'), @@ -430,7 +460,7 @@ def _extract_mobile(self, url): 'thumbnail': 
json_data.get('videoPosterSrc'), 'formats': [{ 'format_id': 'mobile', - 'url': json_data.get('videoSrc'), + 'url': redirect_url, 'ext': 'mp4', }] } From 9d7fde89a40360396f0baa2ee8bf507f92108b32 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 11 Jun 2023 12:15:05 -0500 Subject: [PATCH 177/501] [extractor/zee5] Fix extraction of new content (#7280) Authored by: bashonly --- yt_dlp/extractor/zee5.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index a64eb9ed0d..b4734cc8f1 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -1,14 +1,16 @@ import json -import random -import string +import time +import uuid from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, parse_age_limit, str_or_none, + try_call, try_get, unified_strdate, unified_timestamp, @@ -94,12 +96,12 @@ class Zee5IE(InfoExtractor): 'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973', 'only_matching': True }] - _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' - _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') + _DEVICE_ID = str(uuid.uuid4()) _USER_TOKEN = None _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.' _NETRC_MACHINE = 'zee5' _GEO_COUNTRIES = ['IN'] + _USER_COUNTRY = None def _perform_login(self, username, password): if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None: @@ -118,11 +120,16 @@ def _perform_login(self, username, password): self._USER_TOKEN = otp_verify_json.get('token') if not self._USER_TOKEN: raise ExtractorError(otp_request_json['message'], expected=True) - elif username.lower() == 'token' and len(password) > 1198: + elif username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._USER_TOKEN = password else: raise ExtractorError(self._LOGIN_HINT, expected=True) + token = jwt_decode_hs256(self._USER_TOKEN) + if token.get('exp', 0) <= int(time.time()): + raise ExtractorError('User token has expired', expected=True) + self._USER_COUNTRY = token.get('current_country') + def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') access_token_request = self._download_json( @@ -137,8 +144,13 @@ def _real_extract(self, url): data['X-Z5-Guest-Token'] = self._DEVICE_ID json_data = self._download_json( - self._DETAIL_API_URL.format(video_id, self._DEVICE_ID), - video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) + 'https://spapi.zee5.com/singlePlayback/getDetails/secure', video_id, query={ + 'content_id': video_id, + 'device_id': self._DEVICE_ID, + 'platform_name': 'desktop_web', + 'country': self._USER_COUNTRY or self.get_param('geo_bypass_country') or 'IN', + 'check_parental_control': False, + }, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8')) asset_data = json_data['assetDetails'] show_data = json_data.get('showDetails', {}) if 'premium' in asset_data['business_type']: From ab6057ec80aa75db6303b8206916d00c376c622c Mon Sep 17 00:00:00 2001 From: puc9 
<51006296+puc9@users.noreply.github.com> Date: Sun, 11 Jun 2023 11:57:59 -0700 Subject: [PATCH 178/501] [extractor/tiktok] Fix resolution extraction (#7237) Authored by: puc9 --- yt_dlp/extractor/tiktok.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 49035e971c..9c6d74007d 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -218,8 +218,8 @@ def mp3_meta(url): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: - known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height')) - known_resolutions[res].setdefault('width', add_meta.get('width')) + known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) + known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) parsed_meta.update(known_resolutions.get(res, {})) add_meta.setdefault('height', int_or_none(res[:-1])) return [{ From 8790ea7b2536332777bce68590386b1aa935fac7 Mon Sep 17 00:00:00 2001 From: linsui <36977733+linsui@users.noreply.github.com> Date: Mon, 12 Jun 2023 08:02:50 +0000 Subject: [PATCH 179/501] [extractor/ximalaya] Sort playlist entries (#7292) Authored by: linsui --- yt_dlp/extractor/ximalaya.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index ff18ba6975..3d5e6cf90b 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -158,7 +158,7 @@ def _fetch_page(self, playlist_id, page_idx): return self._download_json( 'https://www.ximalaya.com/revision/album/v1/getTracksList', playlist_id, note=f'Downloading tracks list page {page_idx}', - query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data'] + query={'albumId': playlist_id, 'pageNum': page_idx})['data'] def _get_entries(self, page_data): for e in page_data['tracks']: From 345b4c0aedd9d19898ce00d5cef35fe0d277a052 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 12 Jun 2023 14:12:09 -0400 Subject: [PATCH 180/501] [extractor/zaiko] Add extractor (#7254) Closes #7196 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/zaiko.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 yt_dlp/extractor/zaiko.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f54024211e..921b7dee90 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2441,6 +2441,7 @@ from .youporn import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE +from .zaiko import ZaikoIE from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py new file mode 100644 index 0000000000..59fc64c5a9 --- /dev/null +++ b/yt_dlp/extractor/zaiko.py @@ -0,0 +1,92 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + str_or_none, + traverse_obj, + unescapeHTML, + url_or_none, +) + + +class ZaikoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+' + _TESTS = [{ + 'url': 'https://zaiko.io/event/324868/stream/20571/20571', + 'info_dict': { + 'id': '324868', + 'ext': 'mp4', + 'title': 'ZAIKO STREAMING TEST', + 'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)', + 
'uploader_id': '454', + 'uploader': 'ZAIKO ZERO', + 'release_timestamp': 1583809200, + 'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+', + 'release_date': '20200310', + 'categories': ['Tech House'], + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _parse_vue_element_attr(self, name, string, video_id): + page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) + attrs = {} + for key, value in extract_attributes(page_elem).items(): + if key.startswith(':'): + attrs[key[1:]] = self._parse_json( + value, video_id, transform_source=unescapeHTML, fatal=False) + return attrs + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + final_url = urlh.geturl() + if 'zaiko.io/login' in final_url: + self.raise_login_required() + elif '/_buy/' in final_url: + raise ExtractorError('Your account does not have tickets to this event', expected=True) + stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id) + + player_page = self._download_webpage( + stream_meta['stream-access']['video_source'], video_id, + 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) + player_meta = self._parse_vue_element_attr('player', player_page, video_id) + status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + live_status, msg, expected = { + 'vod': ('was_live', 'No VOD stream URL was found', False), + 'archiving': ('post_live', 'Event VOD is still being processed', True), + 'deleting': ('post_live', 'This event has ended', True), + 'deleted': ('post_live', 'This event has ended', True), + 'error': ('post_live', 'This event has ended', True), + 'disconnected': ('post_live', 'Stream has been disconnected', True), + 'live_to_disconnected': ('post_live', 'Stream has been disconnected', True), + 'live': ('is_live', 'No livestream URL found was found', False), + 'waiting': ('is_upcoming', 'Live event has not yet started', True), + 'cancelled': ('not_live', 'Event has been cancelled', True), + }.get(status) or ('not_live', f'Unknown event status "{status}"', False) + + stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( + stream_url, video_id, live=True, fatal=False) if stream_url else [] + if not formats: + self.raise_no_formats(msg, expected=expected) + + return { + 'id': video_id, + 'formats': formats, + 'live_status': live_status, + **traverse_obj(stream_meta, { + 'title': ('event', 'name', {str}), + 'uploader': ('profile', 'name', {str}), + 'uploader_id': ('profile', 'id', {str_or_none}), + 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), + 'categories': ('event', 'genres', ..., {lambda x: x or None}), + }), + **traverse_obj(player_meta, ('initial_event_info', { + 'alt_title': ('title', {str}), + 'thumbnail': ('poster_url', {url_or_none}), + })), + } From cab94a0cd8b6d3fffed5a6faff030274adbed182 Mon Sep 17 00:00:00 2001 From: Cyberes <64224601+Cyberes@users.noreply.github.com> Date: Mon, 12 Jun 2023 21:23:17 -0600 Subject: [PATCH 181/501] [extractor/funker530] Add extractor (#7291) Authored by: Cyberes --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/funker530.py | 79 +++++++++++++++++++++++++++++++++ yt_dlp/extractor/rumble.py | 2 +- 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/funker530.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 
921b7dee90..69c7a9e90a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -664,6 +664,7 @@ FunimationShowIE, ) from .funk import FunkIE +from .funker530 import Funker530IE from .fusion import FusionIE from .fuyintv import FuyinTVIE from .gab import ( diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py new file mode 100644 index 0000000000..ba5ab7d4ee --- /dev/null +++ b/yt_dlp/extractor/funker530.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)) + } diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 98f660f8b6..82f3f0f8c2 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - 
r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] + r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) From c8561c6d03f025268d6d3972abeb47987c8d7cbb Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Tue, 13 Jun 2023 15:49:18 -0500 Subject: [PATCH 182/501] [extractor/wrestleuniverse] Fix cookies support Closes #7298 Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 946edf20a4..b12b0f0a9e 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -41,7 +41,7 @@ def _TOKEN(self): token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value) if not token and not self._REFRESH_TOKEN: self.raise_login_required() - self._REAL_TOKEN = token + self._TOKEN = token if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()): if not self._REFRESH_TOKEN: From 7bcd4813215ac98daa4949af2ffc677c78307a38 Mon Sep 17 00:00:00 2001 From: hoaluvn <hoaluvn@users.noreply.github.com> Date: Wed, 14 Jun 2023 17:52:17 +0200 Subject: [PATCH 183/501] [extractor/urplay] Extract all subtitles (#7309) Authored by: hoaluvn --- yt_dlp/extractor/urplay.py | 25 +++++++++++++------------ yt_dlp/utils/_utils.py | 1 + 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/urplay.py b/yt_dlp/extractor/urplay.py index 5d69dadd67..7f97fc95f5 100644 --- a/yt_dlp/extractor/urplay.py +++ b/yt_dlp/extractor/urplay.py @@ -112,18 +112,19 @@ def parse_lang_code(code): lang = ISO639Utils.short2long(lang) return lang or None - for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): - if (k in ('sd', 'hd') or not isinstance(v, dict)): - continue - lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) - if not sttl_url: - continue - lang = parse_lang_code(lang) - if not lang: - continue - sttl = subtitles.get(lang) or [] - sttl.append({'ext': k, 'url': sttl_url, }) - subtitles[lang] = sttl + for stream in urplayer_data['streamingInfo'].values(): + for k, v in stream.items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl image = urplayer_data.get('image') or {} thumbnails = [] diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 4179d58c16..6462101165 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -4147,6 +4147,7 @@ class ISO639Utils: 'or': 'ori', 'os': 'oss', 'pa': 'pan', + 'pe': 'per', 'pi': 'pli', 'pl': 'pol', 'ps': 'pus', From 6daaf21092888beff11b807cd46f832f1f9c46a0 Mon Sep 17 00:00:00 2001 From: RjY <rjy@users.sourceforge.net> Date: Wed, 14 Jun 2023 19:40:06 +0100 Subject: [PATCH 184/501] [extractor/discogs] Add extractor (#6624) Authored by: rjy --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/discogs.py | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 yt_dlp/extractor/discogs.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 69c7a9e90a..e4fd944e78 100644 --- 
a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -520,6 +520,7 @@ DeuxMNewsIE ) from .digitalconcerthall import DigitalConcertHallIE +from .discogs import DiscogsReleasePlaylistIE from .discovery import DiscoveryIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE diff --git a/yt_dlp/extractor/discogs.py b/yt_dlp/extractor/discogs.py new file mode 100644 index 0000000000..048c62288c --- /dev/null +++ b/yt_dlp/extractor/discogs.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class DiscogsReleasePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?P<type>release|master)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm', + 'info_dict': { + 'id': 'release1', + 'title': 'Stockholm', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time', + 'info_dict': { + 'id': 'master113', + 'title': 'Moments In Time', + }, + 'playlist_mincount': 53, + }] + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + display_id = f'{playlist_type}{playlist_id}' + response = self._download_json( + f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id) + + entries = [ + self.url_result(video['uri'], YoutubeIE, video_title=video.get('title')) + for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))] + + return self.playlist_result(entries, display_id, response.get('title')) From 83465fc4100a2fb2c188898fbc2f3021f6a9b4dd Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Wed, 14 Jun 2023 12:54:06 -0600 Subject: [PATCH 185/501] [extractor/ettutv] Add extractor (#6579) Closes #6359 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/ettutv.py | 60 +++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 yt_dlp/extractor/ettutv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e4fd944e78..10e132b4b8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -579,6 +579,7 @@ ESPNCricInfoIE, ) from .esri import EsriVideoIE +from .ettutv import EttuTvIE from .europa import EuropaIE, EuroParlWebstreamIE from .europeantour import EuropeanTourIE from .eurosport import EurosportIE diff --git a/yt_dlp/extractor/ettutv.py b/yt_dlp/extractor/ettutv.py new file mode 100644 index 0000000000..46d7255438 --- /dev/null +++ b/yt_dlp/extractor/ettutv.py @@ -0,0 +1,60 @@ +from .common import InfoExtractor +from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none + + +class EttuTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.ettu.tv/en-int/playerpage/1573849', + 'md5': '5874b7639a2aa866d1f6c3a4037c7c09', + 'info_dict': { + 'id': '1573849', + 'title': 'Ni Xia Lian - Shao Jieni', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677348600, + 'upload_date': '20230225', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ettu.tv/en-int/playerpage/1573753', + 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa', + 'info_dict': { + 'id': '1573753', + 'title': 'Qiu Dang - Jorgic Darko', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677423600, + 'upload_date': '20230226', 
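
# The traverse_obj() idioms used throughout this patch series, condensed into
# one runnable example: a tuple walks nested keys, `...` branches over every
# list item, a callable key filters items, and a set such as {int_or_none}
# applies a coercion to each matched value; None results are silently dropped
# instead of raising. The sample data below is made up for illustration.
from yt_dlp.utils import int_or_none, traverse_obj, url_or_none

data = {'videos': [
    {'url': 'https://example.com/a.mp4', 'bitrate': '128'},
    {'url': None, 'bitrate': 'n/a'},
]}

# keep only entries whose 'url' is a valid URL (the broken entry is dropped)
assert len(traverse_obj(data, ('videos', lambda _, v: url_or_none(v['url'])))) == 1
# branch + coerce: '128' -> 128, while the uncoercible 'n/a' becomes None and is dropped
assert traverse_obj(data, ('videos', ..., 'bitrate', {int_or_none})) == [128]
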
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_settings = self._download_json( + f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={ + 'language': 'en', + 'showTitle': 'true', + 'device': 'desktop', + }) + + stream_response = self._download_json(player_settings['streamAccess'], video_id, data={}) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream_response['data']['stream'], video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(player_settings, { + 'title': 'title', + 'description': ('metaInformation', 'competition'), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('date', {unified_timestamp}), + 'is_live': ('isLivestream', {bool_or_none}), + }) + } From fdd69db38924c38194ef236b26325d66ac815c88 Mon Sep 17 00:00:00 2001 From: "Jeong, Heon" <blmarket@gmail.com> Date: Wed, 14 Jun 2023 15:01:18 -0400 Subject: [PATCH 186/501] [extractor/afreecatv] Fix extractor (#6283) Closes #6133 Authored by: blmarket --- yt_dlp/extractor/afreecatv.py | 103 +++++++++------------------------- 1 file changed, 27 insertions(+), 76 deletions(-) diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 9276fe7997..3d26d9c25d 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -76,59 +76,6 @@ class AfreecaTVIE(InfoExtractor): }, }], 'skip': 'Video is gone', - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', - 'info_dict': { - 'id': '18650793', - 'ext': 'mp4', - 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '윈아디', - 'uploader_id': 'badkids', - 'duration': 107, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', - 'info_dict': { - 'id': '10481652', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'duration': 6492, - }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '20160502_c4c62b9d_174361386_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 3601, - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '20160502_39e739bb_174361386_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' 
(part 2)", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160502', - 'duration': 2891, - }, - }], - 'params': { - 'skip_download': True, - }, }, { # non standard key 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', @@ -146,8 +93,8 @@ class AfreecaTVIE(InfoExtractor): 'skip_download': True, }, }, { - # PARTIAL_ADULT - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + # adult content + 'url': 'https://vod.afreecatv.com/player/97267690', 'info_dict': { 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', @@ -161,16 +108,25 @@ class AfreecaTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['adult content'], + 'skip': 'The VOD does not exist', }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, }, { - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', - 'only_matching': True, - }, { - 'url': 'http://vod.afreecatv.com/player/15055030', - 'only_matching': True, + 'url': 'https://vod.afreecatv.com/player/96753363', + 'info_dict': { + 'id': '20230108_9FF5BEE1_244432674_1', + 'ext': 'mp4', + 'uploader_id': 'rlantnghks', + 'uploader': '페이즈으', + 'duration': 10840, + 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r', + 'upload_date': '20230108', + 'title': '젠지 페이즈', + }, + 'params': { + 'skip_download': True, + }, }] @staticmethod @@ -223,26 +179,21 @@ def _perform_login(self, username, password): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if re.search(r'alert\(["\']This video has been deleted', webpage): - raise ExtractorError( - 'Video %s has been deleted' % video_id, expected=True) - - station_id = self._search_regex( - r'nStationNo\s*=\s*(\d+)', webpage, 'station') - bbs_id = self._search_regex( - r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') - video_id = self._search_regex( - r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) - partial_view = False adult_view = False for _ in range(2): + data = self._download_json( + 'https://api.m.afreecatv.com/station/video/a/view', + video_id, headers={'Referer': url}, data=urlencode_postdata({ + 'nTitleNo': video_id, + 'nApiLevel': 10, + }))['data'] + if traverse_obj(data, ('code', {int})) == -6221: + raise ExtractorError('The VOD does not exist', expected=True) query = { 'nTitleNo': video_id, - 'nStationNo': station_id, - 'nBbsNo': bbs_id, + 'nStationNo': data['station_no'], + 'nBbsNo': data['bbs_no'], } if partial_view: query['partialView'] = 'SKIP_ADULT' From f9213f8a2d7ba46b912afe1dd3ce6bb700a33d72 Mon Sep 17 00:00:00 2001 From: foreignBlade <136548235+foreignBlade@users.noreply.github.com> Date: Thu, 15 Jun 2023 06:56:26 -0400 Subject: [PATCH 187/501] [extractor/stripchat] Fix extractor (#7306) Closes #7305 Authored by: foreignBlade --- yt_dlp/extractor/stripchat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/stripchat.py b/yt_dlp/extractor/stripchat.py index 4d2fb06084..b9523c8654 100644 --- a/yt_dlp/extractor/stripchat.py +++ b/yt_dlp/extractor/stripchat.py @@ -42,14 +42,13 @@ def _real_extract(self, url): elif not traverse_obj(data, ('viewCam', 'model', 'isLive'), expected_type=bool): raise UserNotLive(video_id=video_id) - server = traverse_obj(data, ('viewCam', 'viewServers', 'flashphoner-hls'), expected_type=str) model_id 
= traverse_obj(data, ('viewCam', 'model', 'id'), expected_type=int) formats = [] for host in traverse_obj(data, ('config', 'data', ( (('features', 'featuresV2'), 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))): formats = self._extract_m3u8_formats( - f'https://b-{server}.{host}/hls/{model_id}/master/{model_id}_auto.m3u8', + f'https://edge-hls.{host}/hls/{model_id}/master/{model_id}_auto.m3u8', video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) if formats: break From 125ffaa1737dd04716f2f6fbb0595ad3eb7a4b1c Mon Sep 17 00:00:00 2001 From: TxI5 <92522534+TxI5@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:57:25 +0200 Subject: [PATCH 188/501] [extractor/tv4] Fix extractor (#5649) Closes #5535 Authored by: TxI5, dirkf --- yt_dlp/extractor/tv4.py | 81 ++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/tv4.py b/yt_dlp/extractor/tv4.py index 1378a6f574..10a2fe6e27 100644 --- a/yt_dlp/extractor/tv4.py +++ b/yt_dlp/extractor/tv4.py @@ -2,8 +2,11 @@ from .common import InfoExtractor from ..utils import ( + bool_or_none, int_or_none, parse_iso8601, + traverse_obj, + url_or_none, ) @@ -20,19 +23,25 @@ class TV4IE(InfoExtractor): sport/| ) )(?P<id>[0-9]+)''' - _GEO_COUNTRIES = ['SE'] + _GEO_BYPASS = False _TESTS = [ { + # not geo-restricted 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '2491650', 'ext': 'mp4', 'title': 'Kalla Fakta 5 (english subtitles)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': int, + 'description': '2491650', + 'series': 'Kalla fakta', + 'duration': 1335, + 'thumbnail': r're:^https?://[^/?#]+/api/v2/img/', + 'timestamp': 1385373240, 'upload_date': '20131125', }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', @@ -46,6 +55,7 @@ class TV4IE(InfoExtractor): 'timestamp': int, 'upload_date': '20150130', }, + 'skip': '404 Not Found', }, { 'url': 'http://www.tv4play.se/sport/3060959', @@ -69,29 +79,28 @@ class TV4IE(InfoExtractor): } ] - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'https://playback-api.b17g.net/asset/%s' % video_id, - video_id, 'Downloading video info JSON', query={ - 'service': 'tv4', - 'device': 'browser', - 'protocol': 'hls,dash', - 'drm': 'widevine', - })['metadata'] - - title = info['title'] - - manifest_url = self._download_json( - 'https://playback-api.b17g.net/media/' + video_id, - video_id, query={ + def _call_api(self, endpoint, video_id, headers=None, query={}): + return self._download_json( + f'https://playback2.a2d.tv/{endpoint}/{video_id}', video_id, + f'Downloading {endpoint} API JSON', headers=headers, query={ 'service': 'tv4', 'device': 'browser', 'protocol': 'hls', - })['playbackItem']['manifestUrl'] - formats = [] - subtitles = {} + **query, + }) + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = traverse_obj(self._call_api('asset', video_id, query={ + 'protocol': 'hls,dash', + 'drm': 'widevine', + }), ('metadata', {dict})) or {} + + manifest_url = self._call_api( + 'play', video_id, headers=self.geo_verification_headers())['playbackItem']['manifestUrl'] + + formats, subtitles = [], {} fmts, subs = self._extract_m3u8_formats_and_subtitles( manifest_url, video_id, 'mp4', @@ -117,20 +126,24 @@ def _real_extract(self, url): subtitles = 
self._merge_subtitles(subtitles, subs) if not formats and info.get('is_geo_restricted'): - self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + self.raise_geo_restricted( + 'This video is not available from your location due to geo-restriction, or not being authenticated', + countries=['SE']) return { 'id': video_id, - 'title': title, 'formats': formats, 'subtitles': subtitles, - 'description': info.get('description'), - 'timestamp': parse_iso8601(info.get('broadcast_date_time')), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('image'), - 'is_live': info.get('isLive') is True, - 'series': info.get('seriesTitle'), - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode': info.get('episodeTitle'), - 'episode_number': int_or_none(info.get('episodeNumber')), + **traverse_obj(info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': (('broadcast_date_time', 'broadcastDateTime'), {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'is_live': ('isLive', {bool_or_none}), + 'series': ('seriesTitle', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('episodeTitle', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + }, get_all=False), } From 0a5d7c39e17bb9bd50c9db42bcad40eb82d7f784 Mon Sep 17 00:00:00 2001 From: toomyzoom <52140413+toomyzoom@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:23:01 -0700 Subject: [PATCH 189/501] [extractor/iwara] Fix authentication (#7137) Closes #7035, Closes #7207 Authored by: toomyzoom --- yt_dlp/extractor/iwara.py | 149 ++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 64 deletions(-) diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index bdc39a7ddb..e23fdfd6ad 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,68 +1,83 @@ import functools import urllib.parse +import urllib.error import hashlib import json +import time from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, int_or_none, + jwt_decode_hs256, mimetype2ext, qualities, traverse_obj, + try_call, unified_timestamp, ) -# https://github.com/yt-dlp/yt-dlp/issues/6671 class IwaraBaseIE(InfoExtractor): + _NETRC_MACHINE = 'iwara' _USERTOKEN = None _MEDIATOKEN = None - _NETRC_MACHINE = 'iwara' - def _get_user_token(self, invalidate=False): - if not invalidate and self._USERTOKEN: - return self._USERTOKEN + def _is_token_expired(self, token, token_type): + # User token TTL == ~3 weeks, Media token TTL == ~1 hour + if (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 120): + self.to_screen(f'{token_type} token has expired') + return True + def _get_user_token(self): username, password = self._get_login_info() - IwaraBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username) - if not IwaraBaseIE._USERTOKEN or invalidate: - IwaraBaseIE._USERTOKEN = self._download_json( + if not username or not password: + return + + user_token = IwaraBaseIE._USERTOKEN or self.cache.load(self._NETRC_MACHINE, username) + if not user_token or self._is_token_expired(user_token, 'User'): + response = self._download_json( 'https://api.iwara.tv/user/login', None, note='Logging in', - data=json.dumps({ + headers={'Content-Type': 'application/json'}, data=json.dumps({ 'email': username, 'password': password - }).encode('utf-8'), - headers={ + }).encode(), expected_status=lambda x: True) + user_token = 
traverse_obj(response, ('token', {str})) + if not user_token: + error = traverse_obj(response, ('message', {str})) + if 'invalidLogin' in error: + raise ExtractorError('Invalid login credentials', expected=True) + else: + raise ExtractorError(f'Iwara API said: {error or "nothing"}') + + self.cache.store(self._NETRC_MACHINE, username, user_token) + + IwaraBaseIE._USERTOKEN = user_token + + def _get_media_token(self): + self._get_user_token() + if not IwaraBaseIE._USERTOKEN: + return # user has not passed credentials + + if not IwaraBaseIE._MEDIATOKEN or self._is_token_expired(IwaraBaseIE._MEDIATOKEN, 'Media'): + IwaraBaseIE._MEDIATOKEN = self._download_json( + 'https://api.iwara.tv/user/token', None, note='Fetching media token', + data=b'', headers={ + 'Authorization': f'Bearer {IwaraBaseIE._USERTOKEN}', 'Content-Type': 'application/json' - })['token'] + })['accessToken'] - self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) + return {'Authorization': f'Bearer {IwaraBaseIE._MEDIATOKEN}'} - return self._USERTOKEN - - def _get_media_token(self, invalidate=False): - if not invalidate and self._MEDIATOKEN: - return self._MEDIATOKEN - - IwaraBaseIE._MEDIATOKEN = self._download_json( - 'https://api.iwara.tv/user/token', None, note='Fetching media token', - data=b'', # Need to have some data here, even if it's empty - headers={ - 'Authorization': f'Bearer {self._get_user_token()}', - 'Content-Type': 'application/json' - })['accessToken'] - - return self._MEDIATOKEN + def _perform_login(self, username, password): + self._get_media_token() class IwaraIE(IwaraBaseIE): IE_NAME = 'iwara' _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - # this video cannot be played because of migration - 'only_matching': True, 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq', 'info_dict': { 'id': 'k2ayoueezfkx6gvq', @@ -79,25 +94,29 @@ class IwaraIE(IwaraBaseIE): 'timestamp': 1677843869, 'modified_timestamp': 1679056362, }, + 'skip': 'this video cannot be played because of migration', }, { 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/', - 'md5': '20691ce1473ec2766c0788e14c60ce66', + 'md5': '7645f966f069b8ec9210efd9130c9aad', 'info_dict': { 'id': '1ywe1sbkqwumpdxz5', 'ext': 'mp4', 'age_limit': 18, - 'title': 'Aponia 阿波尼亚SEX Party Tonight 手动脱衣 大奶 裸腿', - 'description': 'md5:0c4c310f2e0592d68b9f771d348329ca', - 'uploader': '龙也zZZ', + 'title': 'Aponia アポニア SEX Party Tonight 手の脱衣 巨乳 ', + 'description': 'md5:3f60016fff22060eef1ef26d430b1f67', + 'uploader': 'Lyu ya', 'uploader_id': 'user792540', 'tags': [ 'uncategorized' ], - 'like_count': 1809, - 'view_count': 25156, - 'comment_count': 1, + 'like_count': int, + 'view_count': int, + 'comment_count': int, 'timestamp': 1678732213, - 'modified_timestamp': 1679110271, + 'modified_timestamp': int, + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/581d12b5-46f4-4f15-beb2-cfe2cde5d13d/thumbnail-00.jpg', + 'modified_date': '20230614', + 'upload_date': '20230313', }, }, { 'url': 'https://iwara.tv/video/blggmfno8ghl725bg', @@ -112,12 +131,15 @@ class IwaraIE(IwaraBaseIE): 'tags': [ 'pee' ], - 'like_count': 192, - 'view_count': 12119, - 'comment_count': 0, + 'like_count': int, + 'view_count': int, + 'comment_count': int, 'timestamp': 1598880567, - 'modified_timestamp': 1598908995, - 'availability': 'needs_auth', + 'modified_timestamp': int, + 'upload_date': '20200831', + 'modified_date': '20230605', + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/7693e881-d302-42a4-a780-f16d66b5dadd/thumbnail-00.jpg', 
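
# A condensed sketch of the two-token flow implemented above: the long-lived
# user token returned by /user/login is cached per username, then exchanged at
# /user/token for a short-lived media token sent as a Bearer header on media
# requests. Endpoints and field names come from the diff; the credentials are
# placeholders, and caching, expiry checks and error handling are elided.
import json
import urllib.request

def _post_json(url, payload, headers):
    req = urllib.request.Request(
        url, data=payload, headers={'Content-Type': 'application/json', **headers})
    return json.load(urllib.request.urlopen(req))

user_token = _post_json(
    'https://api.iwara.tv/user/login',
    json.dumps({'email': 'user@example.com', 'password': 'hunter2'}).encode(), {})['token']
media_token = _post_json(
    'https://api.iwara.tv/user/token', b'',
    {'Authorization': f'Bearer {user_token}'})['accessToken']
auth_headers = {'Authorization': f'Bearer {media_token}'}  # attach to media requests
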
+ # 'availability': 'needs_auth', }, }] @@ -142,17 +164,16 @@ def _extract_formats(self, video_id, fileurl): def _real_extract(self, url): video_id = self._match_id(url) - username, password = self._get_login_info() - headers = { - 'Authorization': f'Bearer {self._get_media_token()}', - } if username and password else None - video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True, headers=headers) + username, _ = self._get_login_info() + video_data = self._download_json( + f'https://api.iwara.tv/video/{video_id}', video_id, + expected_status=lambda x: True, headers=self._get_media_token()) errmsg = video_data.get('message') # at this point we can actually get uploaded user info, but do we need it? if errmsg == 'errors.privateVideo': - self.raise_login_required('Private video. Login if you have permissions to watch') + self.raise_login_required('Private video. Login if you have permissions to watch', method='password') elif errmsg == 'errors.notFound' and not username: - self.raise_login_required('Video may need login to view') + self.raise_login_required('Video may need login to view', method='password') elif errmsg: # None if success raise ExtractorError(f'Iwara says: {errmsg}') @@ -181,15 +202,6 @@ def _real_extract(self, url): 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))), } - def _perform_login(self, username, password): - if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token(): - self.write_debug('Skipping logging in') - return - - IwaraBaseIE._USERTOKEN = self._get_user_token(True) - self._get_media_token(True) - self.cache.store(self._NETRC_MACHINE, username, IwaraBaseIE._USERTOKEN) - class IwaraUserIE(IwaraBaseIE): _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)' @@ -200,12 +212,14 @@ class IwaraUserIE(IwaraBaseIE): 'url': 'https://iwara.tv/profile/user792540/videos', 'info_dict': { 'id': 'user792540', + 'title': 'Lyu ya', }, - 'playlist_mincount': 80, + 'playlist_mincount': 70, }, { 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos', 'info_dict': { 'id': 'theblackbirdcalls', + 'title': 'TheBlackbirdCalls', }, 'playlist_mincount': 723, }, { @@ -214,6 +228,13 @@ class IwaraUserIE(IwaraBaseIE): }, { 'url': 'https://iwara.tv/profile/theblackbirdcalls', 'only_matching': True, + }, { + 'url': 'https://www.iwara.tv/profile/lumymmd', + 'info_dict': { + 'id': 'lumymmd', + 'title': 'Lumy MMD', + }, + 'playlist_mincount': 1, }] def _entries(self, playlist_id, user_id, page): @@ -225,7 +246,7 @@ def _entries(self, playlist_id, user_id, page): 'sort': 'date', 'user': user_id, 'limit': self._PER_PAGE, - }) + }, headers=self._get_media_token()) for x in traverse_obj(videos, ('results', ..., 'id')): yield self.url_result(f'https://iwara.tv/video/{x}') @@ -244,7 +265,6 @@ def _real_extract(self, url): class IwaraPlaylistIE(IwaraBaseIE): - # the ID is an UUID but I don't think it's necessary to write concrete regex _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)' IE_NAME = 'iwara:playlist' _PER_PAGE = 32 @@ -260,7 +280,8 @@ class IwaraPlaylistIE(IwaraBaseIE): def _entries(self, playlist_id, first_page, page): videos = self._download_json( 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}', - query={'page': page, 'limit': self._PER_PAGE}) if page else first_page + query={'page': page, 'limit': self._PER_PAGE}, + headers=self._get_media_token()) if page else first_page for x in traverse_obj(videos, ('results', ..., 
'id')): yield self.url_result(f'https://iwara.tv/video/{x}') @@ -268,7 +289,7 @@ def _real_extract(self, url): playlist_id = self._match_id(url) page_0 = self._download_json( f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id, - note='Requesting playlist info') + note='Requesting playlist info', headers=self._get_media_token()) return self.playlist_result( OnDemandPagedList( From ff9b0e071ffae5543cc309e6f9e647ac51e5846e Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 14 Jun 2023 19:08:46 +0530 Subject: [PATCH 190/501] [extractor/youtube] Determine audio language using automatic captions --- yt_dlp/extractor/youtube.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 47ad1da76c..606f24d04d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4312,9 +4312,13 @@ def process_language(container, base_url, lang_code, sub_name, query): continue trans_code += f'-{lang_code}' trans_name += format_field(lang_name, None, ' from %s') - # Add an "-orig" label to the original language so that it can be distinguished. - # The subs are returned without "-orig" as well for compatibility if lang_code == f'a-{orig_trans_code}': + # Set audio language based on original subtitles + for f in formats: + if f.get('acodec') != 'none' and not f.get('language'): + f['language'] = orig_trans_code + # Add an "-orig" label to the original language so that it can be distinguished. + # The subs are returned without "-orig" as well for compatibility process_language( automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) # Setting tlang=lang returns damaged subtitles. From 13ff78095372fd98900a32572cf817994c07ccb5 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 14 Jun 2023 19:09:53 +0530 Subject: [PATCH 191/501] [postprocessor] Print newline for `--progress-template` Closes #7193 --- yt_dlp/postprocessor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index 537792b07f..08b0fe1ff9 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -187,7 +187,7 @@ def report_progress(self, s): tmpl = progress_template.get('postprocess') if tmpl: self._downloader.to_screen( - self._downloader.evaluate_outtmpl(tmpl, progress_dict), skip_eol=True, quiet=False) + self._downloader.evaluate_outtmpl(tmpl, progress_dict), quiet=False) self._downloader.to_console_title(self._downloader.evaluate_outtmpl( progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s', From 01aba2519a0884ef17d5f85608dbd2a455577147 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 18 Jun 2023 04:04:52 +0530 Subject: [PATCH 192/501] [jsinterp] Fix global object extraction Closes #7327 --- test/test_youtube_signature.py | 7 ++++++- yt_dlp/jsinterp.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 01f09de88c..6759d2c467 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -62,6 +62,11 @@ 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', + ), 
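
# Why this patch widens the lookbehind in extract_object() from (?<!this\.)
# to (?<!\.): the old pattern still accepted property assignments such as
# "a.xyz={...}" as global object definitions, so the interpreter could latch
# onto the wrong match in newer players. A self-contained demonstration (the
# JS snippets are made up; "xyz" stands in for the interpolated object name):
import re

old = re.compile(r'(?<!this\.)xyz\s*=\s*\{')
new = re.compile(r'(?<!\.)xyz\s*=\s*\{')

js = 'a.xyz={k:function(){return 0}};'
assert old.search(js)      # matched before the fix (false positive)
assert not new.search(js)  # correctly rejected after the fix
assert new.search('var xyz={k:function(){return 0}};')  # real definition still matches
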
+ ( + 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ) ] @@ -230,7 +235,7 @@ def n_sig(jscode, sig_input): make_sig_test = t_factory( - 'signature', signature, re.compile(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) + 'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index d6d555733a..9c280fb86f 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -779,7 +779,7 @@ def extract_object(self, objname): obj = {} obj_m = re.search( r'''(?x) - (?<!this\.)%s\s*=\s*{\s* + (?<!\.)%s\s*=\s*{\s* (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), From 81c8b9bdd9841b72cbfc1bbff9dab5fb4aa038b0 Mon Sep 17 00:00:00 2001 From: garret <76261416+garret1317@users.noreply.github.com> Date: Mon, 19 Jun 2023 14:25:27 +0100 Subject: [PATCH 193/501] [extractor/nhk] `NhkRadiruLive`: Add extractor (#7332) Authored by: garret1317 --- README.md | 3 ++ yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/nhk.py | 75 ++++++++++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ce555c66f0..659730410b 100644 --- a/README.md +++ b/README.md @@ -1850,6 +1850,9 @@ ### wrestleuniverse #### twitchstream (Twitch) * `client_id`: Client ID value to be sent with GraphQL requests, e.g. `twitchstream:client_id=kimne78kx3ncx6brgo4mv6wki5h1ko` +#### nhkradirulive (NHK らじる★らじる LIVE) +* `area`: Which regional variation to extract. Valid areas are: `sapporo`, `sendai`, `tokyo`, `nagoya`, `osaka`, `hiroshima`, `matsuyama`, `fukuoka`. 
Defaults to `tokyo`
+
 **Note**: These options may be changed/removed in the future without concern for backward compatibility

 <!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE -->
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 10e132b4b8..394f3c29d3 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1260,6 +1260,7 @@
     NhkForSchoolProgramListIE,
     NhkRadioNewsPageIE,
     NhkRadiruIE,
+    NhkRadiruLiveIE,
 )
 from .nhl import NHLIE
 from .nick import (
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index a3efa326a1..fbd6a18f6d 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -2,12 +2,15 @@

 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
+    int_or_none,
+    join_nonempty,
     parse_duration,
     traverse_obj,
     unescapeHTML,
     unified_timestamp,
+    url_or_none,
     urljoin,
-    url_or_none
 )


@@ -492,3 +495,73 @@ class NhkRadioNewsPageIE(InfoExtractor):

     def _real_extract(self, url):
         return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE)
+
+
+class NhkRadiruLiveIE(InfoExtractor):
+    _GEO_COUNTRIES = ['JP']
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)'
+    _TESTS = [{
+        # radio 1, no area specified
+        'url': 'https://www.nhk.or.jp/radio/player/?ch=r1',
+        'info_dict': {
+            'id': 'r1-tokyo',
+            'title': 're:^NHKネットラジオ第1 東京.+$',
+            'ext': 'm4a',
+            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png',
+            'live_status': 'is_live',
+        },
+    }, {
+        # radio 2, area specified
+        # (the area doesn't actually matter, r2 is national)
+        'url': 'https://www.nhk.or.jp/radio/player/?ch=r2',
+        'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}},
+        'info_dict': {
+            'id': 'r2-fukuoka',
+            'title': 're:^NHKネットラジオ第2 福岡.+$',
+            'ext': 'm4a',
+            'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png',
+            'live_status': 'is_live',
+        },
+    }, {
+        # fm, area specified
+        'url': 'https://www.nhk.or.jp/radio/player/?ch=fm',
+        'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}},
+        'info_dict': {
+            'id': 'fm-sapporo',
+            'title': 're:^NHKネットラジオFM 札幌.+$',
+            'ext': 'm4a',
+            'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png',
+            'live_status': 'is_live',
+        }
+    }]
+
+    _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'}
+
+    def _real_extract(self, url):
+        station = self._match_id(url)
+        area = self._configuration_arg('area', ['tokyo'])[0]
+
+        config = self._download_xml(
+            'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information')
+        data = config.find(f'.//data//area[.="{area}"]/..')
+
+        if not data:
+            raise ExtractorError('Invalid area. 
Valid areas are: %s' % ', '.join( + [i.text for i in config.findall('.//data//area')]), expected=True) + + noa_info = self._download_json( + f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), + station, note=f'Downloading {area} station metadata') + present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) + + return { + 'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))), + 'id': join_nonempty(station, area), + 'thumbnails': traverse_obj(present_info, ('service', 'images', ..., { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + 'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station), + 'is_live': True, + } From 6f69101dc912690338d32e2aab085c32e44eba3f Mon Sep 17 00:00:00 2001 From: Vladislav <117850688+7vlad7@users.noreply.github.com> Date: Mon, 19 Jun 2023 23:43:35 +0300 Subject: [PATCH 194/501] [extractor/yappy] YappyProfile: Add extractor (#7346) Authored by: 7vlad7 --- yt_dlp/extractor/_extractors.py | 5 ++++- yt_dlp/extractor/yappy.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 394f3c29d3..7e1fa4a0d1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2427,7 +2427,10 @@ ZenYandexChannelIE, ) from .yapfiles import YapFilesIE -from .yappy import YappyIE +from .yappy import ( + YappyIE, + YappyProfileIE, +) from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .yle_areena import YleAreenaIE diff --git a/yt_dlp/extractor/yappy.py b/yt_dlp/extractor/yappy.py index f168bdbf9a..7b3d0cb81f 100644 --- a/yt_dlp/extractor/yappy.py +++ b/yt_dlp/extractor/yappy.py @@ -1,9 +1,10 @@ from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, int_or_none, traverse_obj, unified_timestamp, - url_or_none + url_or_none, ) @@ -97,3 +98,30 @@ def _real_extract(self, url): 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None, 'repost_count': int_or_none(media_data.get('sharingCount')) } + + +class YappyProfileIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/profile/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/profile/59a0c8c485e5410b9c43474bf4c6a373', + 'info_dict': { + 'id': '59a0c8c485e5410b9c43474bf4c6a373', + }, + 'playlist_mincount': 527, + }] + + def _real_extract(self, url): + profile_id = self._match_id(url) + + def fetch_page(page_num): + page_num += 1 + videos = self._download_json( + f'https://yappy.media/api/video/list/{profile_id}?page={page_num}', + profile_id, f'Downloading profile page {page_num} JSON') + + for video in traverse_obj(videos, ('results', lambda _, v: v['uuid'])): + yield self.url_result( + f'https://yappy.media/video/{video["uuid"]}', YappyIE, + video['uuid'], video.get('description')) + + return self.playlist_result(OnDemandPagedList(fetch_page, 15), profile_id) From 5cc09c004bd5edbbada9b041c08a720cadc4f4df Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Tue, 20 Jun 2023 12:22:36 +0800 Subject: [PATCH 195/501] [extractor/zaiko] ZaikoETicket: Add extractor (#7347) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/zaiko.py | 70 +++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py 
b/yt_dlp/extractor/_extractors.py index 7e1fa4a0d1..ff659a7a29 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2448,7 +2448,10 @@ from .youporn import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE -from .zaiko import ZaikoIE +from .zaiko import ( + ZaikoIE, + ZaikoETicketIE, +) from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index 59fc64c5a9..84cee4445e 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -1,3 +1,5 @@ +import base64 + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -5,12 +7,33 @@ int_or_none, str_or_none, traverse_obj, + try_call, unescapeHTML, url_or_none, ) -class ZaikoIE(InfoExtractor): +class ZaikoBaseIE(InfoExtractor): + def _download_real_webpage(self, url, video_id): + webpage, urlh = self._download_webpage_handle(url, video_id) + final_url = urlh.geturl() + if 'zaiko.io/login' in final_url: + self.raise_login_required() + elif '/_buy/' in final_url: + raise ExtractorError('Your account does not have tickets to this event', expected=True) + return webpage + + def _parse_vue_element_attr(self, name, string, video_id): + page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) + attrs = {} + for key, value in extract_attributes(page_elem).items(): + if key.startswith(':'): + attrs[key[1:]] = self._parse_json( + value, video_id, transform_source=unescapeHTML, fatal=False) + return attrs + + +class ZaikoIE(ZaikoBaseIE): _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+' _TESTS = [{ 'url': 'https://zaiko.io/event/324868/stream/20571/20571', @@ -30,24 +53,10 @@ class ZaikoIE(InfoExtractor): 'params': {'skip_download': 'm3u8'}, }] - def _parse_vue_element_attr(self, name, string, video_id): - page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) - attrs = {} - for key, value in extract_attributes(page_elem).items(): - if key.startswith(':'): - attrs[key[1:]] = self._parse_json( - value, video_id, transform_source=unescapeHTML, fatal=False) - return attrs - def _real_extract(self, url): video_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, video_id) - final_url = urlh.geturl() - if 'zaiko.io/login' in final_url: - self.raise_login_required() - elif '/_buy/' in final_url: - raise ExtractorError('Your account does not have tickets to this event', expected=True) + webpage = self._download_real_webpage(url, video_id) stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id) player_page = self._download_webpage( @@ -90,3 +99,32 @@ def _real_extract(self, url): 'thumbnail': ('poster_url', {url_or_none}), })), } + + +class ZaikoETicketIE(ZaikoBaseIE): + _VALID_URL = r'https?://(?:www.)?zaiko\.io/account/eticket/(?P<id>[\w=-]{49})' + _TESTS = [{ + 'url': 'https://zaiko.io/account/eticket/TZjMwMzQ2Y2EzMXwyMDIzMDYwNzEyMTMyNXw1MDViOWU2Mw==', + 'playlist_count': 1, + 'info_dict': { + 'id': 'f30346ca31-20230607121325-505b9e63', + 'title': 'ZAIKO STREAMING TEST', + 'thumbnail': 'https://media.zkocdn.net/pf_1/1_3wdyjcjyupseatkwid34u', + }, + 'skip': 'Only available with the ticketholding account', + }] + + def _real_extract(self, url): + ticket_id = self._match_id(url) + ticket_id = try_call( + lambda: base64.urlsafe_b64decode(ticket_id[1:]).decode().replace('|', '-')) or ticket_id + + webpage = self._download_real_webpage(url, ticket_id) + eticket = self._parse_vue_element_attr('eticket', 
webpage, ticket_id) + + return self.playlist_result( + [self.url_result(stream, ZaikoIE) for stream in traverse_obj(eticket, ('streams', ..., 'url'))], + ticket_id, **traverse_obj(eticket, ('ticket-details', { + 'title': 'event_name', + 'thumbnail': 'event_img_url', + }))) From eedda5252c05327748dede204a8fccafa0288118 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 19 Jun 2023 14:06:39 +0530 Subject: [PATCH 196/501] [utils] `FormatSorter`: Improve `size` and `br` Closes #1596 Previously, when some formats have accurate size and some approximate, the ones with accurate size was always prioritized For formats with known tbr and unknown vbr/abr, we were setting (vbr=tbr, abr=0) for sorting to work. This is no longer needed. Authored by pukkandan, u-spec-png --- test/test_InfoExtractor.py | 10 ---------- yt_dlp/utils/_utils.py | 25 +++++++++++++++---------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 1f60abfd25..b7dee496af 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -917,8 +917,6 @@ def test_parse_m3u8_formats(self): 'acodec': 'mp4a.40.2', 'video_ext': 'mp4', 'audio_ext': 'none', - 'vbr': 263.851, - 'abr': 0, }, { 'format_id': '577', 'format_index': None, @@ -936,8 +934,6 @@ def test_parse_m3u8_formats(self): 'acodec': 'mp4a.40.2', 'video_ext': 'mp4', 'audio_ext': 'none', - 'vbr': 577.61, - 'abr': 0, }, { 'format_id': '915', 'format_index': None, @@ -955,8 +951,6 @@ def test_parse_m3u8_formats(self): 'acodec': 'mp4a.40.2', 'video_ext': 'mp4', 'audio_ext': 'none', - 'vbr': 915.905, - 'abr': 0, }, { 'format_id': '1030', 'format_index': None, @@ -974,8 +968,6 @@ def test_parse_m3u8_formats(self): 'acodec': 'mp4a.40.2', 'video_ext': 'mp4', 'audio_ext': 'none', - 'vbr': 1030.138, - 'abr': 0, }, { 'format_id': '1924', 'format_index': None, @@ -993,8 +985,6 @@ def test_parse_m3u8_formats(self): 'acodec': 'mp4a.40.2', 'video_ext': 'mp4', 'audio_ext': 'none', - 'vbr': 1924.009, - 'abr': 0, }], { 'en': [{ diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 6462101165..1fd6f44af4 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5669,6 +5669,7 @@ def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None) return orderedSet(requested) +# TODO: Rewrite class FormatSorter: regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' @@ -5717,8 +5718,10 @@ class FormatSorter: 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, - 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, + 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), + 'function': lambda it: next(filter(None, it), None)}, + 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), + 'function': lambda it: next(filter(None, it), None)}, 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, @@ -5949,13 +5952,15 @@ def calculate_preference(self, format): format['preference'] = -100 # Determine missing bitrates - if format.get('tbr') is None: - if format.get('vbr') is not None and format.get('abr') is not None: - format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) - else: - if format.get('vcodec') != 'none' and format.get('vbr') is None: - format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != 'none' and format.get('abr') is None: - format['abr'] = format.get('tbr') - format.get('vbr', 0) + if format.get('vcodec') == 'none': + format['vbr'] = 0 + if format.get('acodec') == 'none': + format['abr'] = 0 + if not format.get('vbr') and format.get('vcodec') != 'none': + format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None + if not format.get('abr') and format.get('acodec') != 'none': + format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None + if not format.get('tbr'): + format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None return tuple(self._calculate_field_preference(format, field) for field in self._order) From 51a07b0dca4c079d58311c19b6d1c097c24bb021 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 19 Jun 2023 14:09:26 +0530 Subject: [PATCH 197/501] [extractor/youtube] Prioritize premium formats Closes #7283 --- yt_dlp/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 606f24d04d..4cbf0115c3 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3848,6 +3848,7 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) + name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -3855,15 +3856,15 @@ def build_fragments(f): 'format_note': join_nonempty( join_nonempty(audio_track.get('displayName'), language_preference > 0 and ' (default)', delim=''), - fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), - fmt.get('isDrc') and 'DRC', + name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 - 'source_preference': -10 if throttled else -5 if itag == '22' else -1, + 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1) + + (100 if 'Premium' in name else 0)), 'fps': int_or_none(fmt.get('fps')) or None, 'audio_channels': fmt.get('audioChannels'), 'height': height, From 2e023649ea4e11151545a34dc1360c114981a236 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 19 Jun 2023 14:45:59 +0530 Subject: [PATCH 198/501] [cookies] Revert compatibility breakage in b38d4c941d1993ab27e4c0f8e024e23c2ec0f8f8 --- README.md | 2 +- yt_dlp/cookies.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 659730410b..c39e151d47 100644 --- a/README.md +++ b/README.md @@ -728,7 +728,7 @@ ## Filesystem Options: By default, all containers of the most recently accessed profile are used. Currently supported keyrings are: basictext, - gnomekeyring, kwallet + gnomekeyring, kwallet, kwallet5, kwallet6 --no-cookies-from-browser Do not load cookies from browser (default) --cache-dir DIR Location in the filesystem where yt-dlp can store some downloaded information (such as diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index e46d193416..8693e0b4ad 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -705,11 +705,11 @@ class _LinuxKeyring(Enum): https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/key_storage_util_linux.h SelectedLinuxBackend """ - KWALLET4 = auto() # this value is just called KWALLET in the chromium source but it is for KDE4 only + KWALLET = auto() # KDE4 KWALLET5 = auto() KWALLET6 = auto() - GNOME_KEYRING = auto() - BASIC_TEXT = auto() + GNOMEKEYRING = auto() + BASICTEXT = auto() SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() @@ -803,7 +803,7 @@ def _choose_linux_keyring(logger): desktop_environment = _get_linux_desktop_environment(os.environ, logger) logger.debug(f'detected desktop environment: {desktop_environment.name}') if desktop_environment == _LinuxDesktopEnvironment.KDE4: - linux_keyring = _LinuxKeyring.KWALLET4 + linux_keyring = _LinuxKeyring.KWALLET elif desktop_environment == _LinuxDesktopEnvironment.KDE5: linux_keyring = _LinuxKeyring.KWALLET5 elif desktop_environment == _LinuxDesktopEnvironment.KDE6: @@ -811,9 +811,9 @@ def _choose_linux_keyring(logger): elif desktop_environment in ( _LinuxDesktopEnvironment.KDE3, _LinuxDesktopEnvironment.LXQT, _LinuxDesktopEnvironment.OTHER ): - linux_keyring = _LinuxKeyring.BASIC_TEXT + linux_keyring = _LinuxKeyring.BASICTEXT else: - linux_keyring = _LinuxKeyring.GNOME_KEYRING + linux_keyring = _LinuxKeyring.GNOMEKEYRING return linux_keyring @@ -828,7 +828,7 @@ def _get_kwallet_network_wallet(keyring, logger): """ default_wallet = 'kdewallet' try: - if keyring == _LinuxKeyring.KWALLET4: + if keyring == _LinuxKeyring.KWALLET: service_name = 'org.kde.kwalletd' wallet_path = '/modules/kwalletd' elif keyring == _LinuxKeyring.KWALLET5: @@ -929,11 +929,11 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) logger.debug(f'Chosen keyring: {keyring.name}') - if keyring in (_LinuxKeyring.KWALLET4, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): + if keyring in (_LinuxKeyring.KWALLET, _LinuxKeyring.KWALLET5, _LinuxKeyring.KWALLET6): return _get_kwallet_password(browser_keyring_name, keyring, logger) - elif keyring == _LinuxKeyring.GNOME_KEYRING: + 
elif keyring == _LinuxKeyring.GNOMEKEYRING:
         return _get_gnome_keyring_password(browser_keyring_name, logger)
-    elif keyring == _LinuxKeyring.BASIC_TEXT:
+    elif keyring == _LinuxKeyring.BASICTEXT:
         # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required)
         return None
     assert False, f'Unknown keyring {keyring}'

From 97afb093d4cbe5df889145afa5f9ede4535e93e4 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Wed, 21 Jun 2023 05:11:14 +0530
Subject: [PATCH 199/501] [extractor/youtube] Ignore wrong fps of some formats

---
 yt_dlp/extractor/youtube.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 4cbf0115c3..4daa4f50e9 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3849,6 +3849,7 @@ def build_fragments(f):

             client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
             name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
+            fps = int_or_none(fmt.get('fps')) or 0
             dct = {
                 'asr': int_or_none(fmt.get('audioSampleRate')),
                 'filesize': int_or_none(fmt.get('contentLength')),
@@ -3865,7 +3866,7 @@ def build_fragments(f):
                 # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
                 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1)
                                       + (100 if 'Premium' in name else 0)),
-                'fps': int_or_none(fmt.get('fps')) or None,
+                'fps': fps if fps > 1 else None,  # For some formats, fps is wrongly returned as 1
                 'audio_channels': fmt.get('audioChannels'),
                 'height': height,
                 'quality': q(quality) - bool(fmt.get('isDrc')) / 2,
@@ -3936,6 +3937,8 @@ def process_manifest_format(f, proto, client_name, itag):
                 f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))])
             if self.get_param('verbose'):
                 f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ')
+            if f.get('fps') and f['fps'] <= 1:
+                del f['fps']
             return True

         subtitles = {}

From 93b39cdbd9dcf351bfa0c4ee252805b4617fdca9 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Wed, 21 Jun 2023 04:31:39 +0530
Subject: [PATCH 200/501] Add `--compat-option playlist-match-filter`

Closes #6073
---
 README.md           | 7 ++++---
 yt_dlp/YoutubeDL.py | 2 +-
 yt_dlp/options.py   | 8 ++++----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index c39e151d47..578f84956d 100644
--- a/README.md
+++ b/README.md
@@ -157,14 +157,15 @@ ### Differences in default behavior
 * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
 * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
 * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is
+* yt-dlp versions between 2021.09.01 and 2023.01.02 applied `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). 
Use `--compat-options playlist-match-filter` to revert this For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options (Do NOT use) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options no-external-downloader-progress`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index b4923920fc..077a37b305 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1403,7 +1403,7 @@ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False): def _match_entry(self, info_dict, incomplete=False, silent=False): """Returns None if the file should be downloaded""" - _type = info_dict.get('_type', 'video') + _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video') assert incomplete or _type == 'video', 'Only video result can be considered complete' video_title = info_dict.get('title', info_dict.get('id', 'entry')) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index fecc274031..1c8d73f16e 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -467,15 +467,15 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): callback_kwargs={ 'allowed_values': { 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', - 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', + 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', 'playlist-match-filter', 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', }, 'aliases': { - 'youtube-dl': ['all', '-multistreams'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': ['no-external-downloader-progress'], + '2022': ['no-external-downloader-progress', 'playlist-match-filter'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' From a35af4306d24c56c6358f89cdf204860d1cd62b4 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 05:18:03 +0530 Subject: [PATCH 201/501] [utils] `strftime_or_none`: Handle negative timestamps Closes #6706 Authored by pukkandan, dirkf --- yt_dlp/utils/_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 1fd6f44af4..256e2db5a9 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2452,7 +2452,10 @@ def strftime_or_none(timestamp, date_format, default=None): if isinstance(timestamp, (int, float)): # unix timestamp # Using naive datetime here can break timestamp() in Windows # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414 - datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) + # Also, datetime.datetime.fromtimestamp breaks for negative timestamps + # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642 + datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc) + + datetime.timedelta(seconds=timestamp)) elif isinstance(timestamp, str): # assume YYYYMMDD datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') date_format = re.sub( # Support %s on windows From ebe1b4e34f43c3acad30e4bcb8484681a030c114 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 05:15:03 +0530 Subject: [PATCH 202/501] [outtmpl] Fix some minor bugs Closes #7164 --- test/test_YoutubeDL.py | 2 +- yt_dlp/YoutubeDL.py | 10 +++++----- yt_dlp/utils/_utils.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ee6c527135..ccc9e36f34 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -755,7 +755,7 @@ def expect_same_infodict(out): test('%(id)d %(id)r', "1234 '1234'") test('%(id)r %(height)r', "'1234' 1080") test('%(ext)s-%(ext|def)d', 'mp4-def') - test('%(width|0)04d', '0000') + test('%(width|0)04d', '0') test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none') FORMATS = self.outtmpl_info['formats'] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 077a37b305..a546ce65ba 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1286,17 +1286,17 @@ def create_key(outer_mobj): if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = f'0{field_size_compat_map[key]:d}d' - if value is None: - value = default - elif replacement is not None: + if None not in (value, replacement): try: value = replacement_formatter.format(replacement, value) except ValueError: - value = na + value, default = None, na flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' - if fmt[-1] == 'l': # list + if value is None: + value, fmt = default, 's' + elif fmt[-1] == 'l': # list delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 256e2db5a9..d10d621d54 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -3302,7 +3302,7 @@ def q(qid): ''' -STR_FORMAT_TYPES = 'diouxXeEfFgGcrs' +STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa' def limit_length(s, length): From 424f3bf03305088df6e01d62f7311be8601ad3f4 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 02:43:10 +0530 Subject: [PATCH 203/501] [downloader/fragment] Do not sleep between fragments Closes #6599 --- yt_dlp/downloader/fragment.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 53b4b604e7..458167216c 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -173,6 +173,9 @@ def _prepare_frag_download(self, ctx): 
**self.params, 'noprogress': True, 'test': False, + 'sleep_interval': 0, + 'max_sleep_interval': 0, + 'sleep_interval_subtitles': 0, }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' From 02948a17d903f544363bb20b51a6d8baed7bba08 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 04:12:01 +0530 Subject: [PATCH 204/501] [update] Do not restart into versions without `--update-to` --- yt_dlp/update.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 6c9bdaf1c7..4790075eb6 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -149,7 +149,7 @@ def __init__(self, ydl, target=None): f'You are switching to an {self.ydl._format_err("unofficial", "red")} executable ' f'from {self.ydl._format_err(self._target_repo, self.ydl.Styles.EMPHASIS)}. ' f'Run {self.ydl._format_err("at your own risk", "light red")}') - self.restart = self._blocked_restart + self._block_restart('Automatically restarting into custom builds is disabled for security reasons') else: self._target_repo = UPDATE_SOURCES.get(self.target_channel) if not self._target_repo: @@ -294,6 +294,7 @@ def update(self): if (_VERSION_RE.fullmatch(self.target_tag[5:]) and version_tuple(self.target_tag[5:]) < (2023, 3, 2)): self.ydl.report_warning('You are downgrading to a version without --update-to') + self._block_restart('Cannot automatically restart to a version without --update-to') directory = os.path.dirname(self.filename) if not os.access(self.filename, os.W_OK): @@ -381,11 +382,11 @@ def restart(self): _, _, returncode = Popen.run(self.cmd) return returncode - def _blocked_restart(self): - self._report_error( - 'Automatically restarting into custom builds is disabled for security reasons. ' - 'Restart yt-dlp to use the updated version', expected=True) - return self.ydl._download_retcode + def _block_restart(self, msg): + def wrapper(): + self._report_error(f'{msg}. 
Restart yt-dlp to use the updated version', expected=True) + return self.ydl._download_retcode + self.restart = wrapper def run_update(ydl): From af7585c824a1e405bd8afa46d87b4be322edc93c Mon Sep 17 00:00:00 2001 From: MMM <flashdagger@googlemail.com> Date: Wed, 21 Jun 2023 04:44:12 +0200 Subject: [PATCH 205/501] [extractor/tagesschau] Fix single audio urls (#6626) Authored by: flashdagger --- yt_dlp/extractor/tagesschau.py | 58 +++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index ea0532c24e..e23b490b00 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -2,10 +2,12 @@ from .common import InfoExtractor from ..utils import ( - js_to_json, + UnsupportedError, extract_attributes, - try_get, int_or_none, + js_to_json, + parse_iso8601, + try_get, ) @@ -14,36 +16,38 @@ class TagesschauIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '7a7287612fa881a1ae1d087df45c2fd6', + 'md5': 'ccb9359bf8c4795836e43759f3408a93', 'info_dict': { 'id': 'video-102143-1', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'duration': 138, }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', - 'md5': '3c54c1f6243d279b706bde660ceec633', + 'md5': '5c15e8f3da049e48829ec9786d835536', 'info_dict': { 'id': 'ts-5727-1', 'ext': 'mp4', 'title': 'Ganze Sendung', + 'duration': 932, }, }, { # exclusive audio 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '4cf22023c285f35e99c24d290ba58cc9', + 'md5': '4bff8f23504df56a0d86ed312d654182', 'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', }, }, { 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': '12cfb212d9325b5ba0d52b625f1aa61c', + 'md5': 'f049fa1698d7564e9ca4c3325108f034', 'info_dict': { 'id': 'bnd-303-1', - 'ext': 'mp4', - 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa', + 'ext': 'mp3', + 'title': 'Das Siegel des Bundesnachrichtendienstes | dpa', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', @@ -51,13 +55,24 @@ class TagesschauIE(InfoExtractor): 'id': 'afd-parteitag-135', 'title': 'AfD', }, - 'playlist_count': 20, + 'playlist_mincount': 15, }, { 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', 'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html', + 'info_dict': { + 'id': 'podcast-11km-327', + 'ext': 'mp3', + 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen', + 'upload_date': '20230322', + 'timestamp': 1679482808, + 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg', + 'description': 'md5:dad059931fe4b3693e3656e93a249848', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', @@ -117,7 +132,7 @@ def _real_extract(self, url): formats = [] if media_url.endswith('master.m3u8'): formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') - elif 
media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
+        elif media_url.endswith('.mp3'):
             formats = [{
                 'url': media_url,
                 'vcodec': 'none',
@@ -130,20 +145,19 @@ def _real_extract(self, url):
                 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
                 'formats': formats
             })
+
+        if not entries:
+            raise UnsupportedError(url)
+
         if len(entries) > 1:
             return self.playlist_result(entries, display_id, title)
-        formats = entries[0]['formats']
-        video_info = self._search_json_ld(webpage, video_id)
-        description = video_info.get('description')
-        thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
-        timestamp = video_info.get('timestamp')
-        title = title or video_info.get('description')

         return {
             'id': display_id,
             'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-            'timestamp': timestamp,
-            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': entries[0]['formats'],
+            'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
+            'description': self._og_search_description(webpage),
+            'duration': entries[0]['duration'],
         }

From db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb Mon Sep 17 00:00:00 2001
From: Nicolai Dagestad <nicolai.github@dagestad.fr>
Date: Wed, 21 Jun 2023 05:07:42 +0200
Subject: [PATCH 206/501] Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
---
 README.md                  | 15 +++++++++--
 yt_dlp/YoutubeDL.py        |  1 +
 yt_dlp/__init__.py         |  5 ++--
 yt_dlp/extractor/common.py | 53 +++++++++++++++++++++-----------------
 yt_dlp/options.py          |  4 +++
 yt_dlp/utils/_utils.py     |  8 ++++++
 6 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 578f84956d..9a00da9035 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@
    * [Extractor Options](#extractor-options)
 * [CONFIGURATION](#configuration)
    * [Configuration file encoding](#configuration-file-encoding)
-   * [Authentication with .netrc file](#authentication-with-netrc-file)
+   * [Authentication with netrc](#authentication-with-netrc)
    * [Notes about environment variables](#notes-about-environment-variables)
 * [OUTPUT TEMPLATE](#output-template)
    * [Output template examples](#output-template-examples)
@@ -910,6 +910,8 @@ ## Authentication Options:
     --netrc-location PATH           Location of .netrc authentication data; either
                                     the path or its containing directory. Defaults
                                     to ~/.netrc
+    --netrc-cmd NETRC_CMD           Command to execute to get the credentials
+                                    for an extractor.
     --video-password PASSWORD       Video password (vimeo, youku)
     --ap-mso MSO                    Adobe Pass multiple-system operator (TV
                                     provider) identifier, use --ap-list-mso for
@@ -1203,7 +1205,7 @@ ### Configuration file encoding

 If you want your file to be decoded differently, add `# coding: ENCODING` to the beginning of the file (e.g. `# coding: shift-jis`). There must be no characters before that, even spaces or BOM.

-### Authentication with `.netrc` file
+### Authentication with netrc

 You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. 
For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you:
 ```
@@ -1223,6 +1225,15 @@ ### Authentication with `.netrc` file

 The default location of the .netrc file is `~` (see below).

+As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter, it shall output the credentials in the netrc format and return `0` on success, other values will be treated as an error. `{}` in the command will be replaced by the name of the extractor to make it possible to select the credentials for the right extractor.
+To use braces in the command, they need to be escaped by doubling them. (see example below)
+
+E.g. To use an encrypted `.netrc` file stored as `.authinfo.gpg`
+```
+yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' https://www.youtube.com/watch?v=BaW_jenozKc
+```
+
+
 ### Notes about environment variables
 * Environment variables are normally specified as `${VARIABLE}`/`$VARIABLE` on UNIX and `%VARIABLE%` on Windows; but is always shown as `${VARIABLE}` in this documentation
 * yt-dlp also allow using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location`
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index a546ce65ba..e51bceef34 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -190,6 +190,7 @@ class YoutubeDL:
     ap_password:       Multiple-system operator account password.
     usenetrc:          Use netrc for authentication instead.
     netrc_location:    Location of the netrc file. Defaults to ~/.netrc.
+    netrc_cmd:         Use a shell command to get credentials
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
     no_warnings:       Do not print out anything for warnings. 
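
Below is a minimal embedding sketch (not part of the patch itself) showing how the new `netrc_cmd` parameter can be used from the Python API; the gpg command is just the illustrative example from the README hunk above, and any command that prints netrc-formatted credentials on stdout would work the same way:

```
import yt_dlp

# Fetch credentials from a shell command instead of a plain-text .netrc file.
# The gpg invocation is an assumption mirroring the README example.
ydl_opts = {'netrc_cmd': 'gpg --decrypt ~/.authinfo.gpg'}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```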
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 137c9503f6..46edd88d3e 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -188,8 +188,8 @@ def validate_minmax(min_val, max_val, min_name, max_name=None): raise ValueError(f'{max_name} "{max_val}" must be must be greater than or equal to {min_name} "{min_val}"') # Usernames and passwords - validate(not opts.usenetrc or (opts.username is None and opts.password is None), - '.netrc', msg='using {name} conflicts with giving username/password') + validate(sum(map(bool, (opts.usenetrc, opts.netrc_cmd, opts.username))) <= 1, '.netrc', + msg='{name}, netrc command and username/password are mutually exclusive options') validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing') validate(opts.ap_password is None or opts.ap_username is not None, 'TV Provider account username', msg='{name} missing') @@ -741,6 +741,7 @@ def parse_options(argv=None): return ParsedOptions(parser, opts, urls, { 'usenetrc': opts.usenetrc, 'netrc_location': opts.netrc_location, + 'netrc_cmd': opts.netrc_cmd, 'username': opts.username, 'password': opts.password, 'twofactor': opts.twofactor, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ca2164a5db..f11a673583 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -13,6 +13,7 @@ import os import random import re +import subprocess import sys import time import types @@ -34,6 +35,7 @@ GeoUtils, HEADRequest, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -70,6 +72,7 @@ smuggle_url, str_or_none, str_to_int, + netrc_from_content, strip_or_none, traverse_obj, truncate_string, @@ -535,7 +538,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -1291,45 +1294,47 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd', '').format(netrc_machine) + if cmd: + self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = netrc_from_content(stdout).authenticators(netrc_machine) - return username, password + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = 
netrc.netrc(netrc_file).authenticators(netrc_machine) + + else: + return None, None + if not info: + raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 1c8d73f16e..b174a24af7 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -720,6 +720,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--netrc-location', dest='netrc_location', metavar='PATH', help='Location of .netrc authentication data; either the path or its containing directory. Defaults to ~/.netrc') + authentication.add_option( + '--netrc-cmd', + dest='netrc_cmd', metavar='NETRC_CMD', + help='Command to execute to get the credentials for an extractor.') authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d10d621d54..28c2785cb0 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -25,6 +25,7 @@ import locale import math import mimetypes +import netrc import operator import os import platform @@ -864,6 +865,13 @@ def escapeHTML(text): ) +class netrc_from_content(netrc.netrc): + def __init__(self, content): + self.hosts, self.macros = {}, {} + with io.StringIO(content) as stream: + self._parse('-', stream, False) + + def process_communicate_or_kill(p, *args, **kwargs): deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed ' f'in a future version. 
Use "{__name__}.Popen.communicate_or_kill" instead') From ad54c9130e793ce433bf9da334fa80df9f3aee58 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 09:21:20 +0530 Subject: [PATCH 207/501] [cleanup] Misc Closes #6288, Closes #7197, Closes #7265, Closes #7353, Closes #5773 Authored by: mikf, freezboltz, pukkandan --- .github/workflows/potential-duplicates.yml | 2 +- README.md | 28 +++++---- devscripts/changelog_override.json | 27 +++++++++ devscripts/cli_to_api.py | 4 +- devscripts/make_changelog.py | 14 ++--- test/test_YoutubeDL.py | 16 +++--- test/test_jsinterp.py | 67 ++++++++++++---------- test/test_youtube_signature.py | 2 +- yt_dlp/YoutubeDL.py | 20 ++++--- yt_dlp/cookies.py | 4 ++ yt_dlp/downloader/common.py | 1 - yt_dlp/downloader/niconico.py | 4 +- yt_dlp/extractor/ciscowebex.py | 4 +- yt_dlp/extractor/common.py | 3 +- yt_dlp/extractor/dumpert.py | 0 yt_dlp/extractor/globalplayer.py | 0 yt_dlp/extractor/odnoklassniki.py | 6 +- yt_dlp/extractor/tvp.py | 4 +- yt_dlp/extractor/vidio.py | 2 +- yt_dlp/extractor/youtube.py | 10 ++-- yt_dlp/options.py | 6 +- yt_dlp/utils/_legacy.py | 6 +- yt_dlp/utils/_utils.py | 10 +--- 23 files changed, 138 insertions(+), 102 deletions(-) mode change 100755 => 100644 yt_dlp/extractor/dumpert.py mode change 100755 => 100644 yt_dlp/extractor/globalplayer.py diff --git a/.github/workflows/potential-duplicates.yml b/.github/workflows/potential-duplicates.yml index 1521ae20c0..cfc5831864 100644 --- a/.github/workflows/potential-duplicates.yml +++ b/.github/workflows/potential-duplicates.yml @@ -12,7 +12,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} label: potential-duplicate state: all - threshold: 0.7 + threshold: 0.3 comment: | This issue is potentially a duplicate of one of the following issues: {{#issues}} diff --git a/README.md b/README.md index 9a00da9035..d9a5e6cefc 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ ### Differences in default behavior * The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead -* Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this +* Some internal metadata such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. 
self-signed), use `--compat-options no-certifi`
 * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
@@ -251,7 +251,7 @@ #### Misc
 ```
 <!-- MANPAGE: END EXCLUDED SECTION -->

-**Note**: The manpages, shell completion files etc. are available inside the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)
+**Note**: The manpages, shell completion (autocomplete) files etc. are available inside the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)

 ## DEPENDENCIES
 Python versions 3.7+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly.
@@ -699,9 +699,8 @@ ## Filesystem Options:
                                     --write-description etc. (default)
     --no-write-playlist-metafiles   Do not write playlist metadata when using
                                     --write-info-json, --write-description etc.
-    --clean-info-json               Remove some private fields such as filenames
-                                    from the infojson. Note that it could still
-                                    contain some personal information (default)
+    --clean-info-json               Remove some internal metadata such as
+                                    filenames from the infojson (default)
     --no-clean-info-json            Write all fields to the infojson
     --write-comments                Retrieve video comments to be placed in the
                                     infojson. The comments are fetched even
@@ -1041,13 +1040,10 @@ ## Post-Processing Options:
                                     that of --use-postprocessor (default:
                                     after_move). Same syntax as the output
                                     template can be used to pass any field as
-                                    arguments to the command. After download, an
-                                    additional field "filepath" that contains
-                                    the final path of the downloaded file is
-                                    also available, and if no fields are passed,
-                                    %(filepath,_filename|)q is appended to the
-                                    end of the command. This option can be used
-                                    multiple times
+                                    arguments to the command. If no fields are
+                                    passed, %(filepath,_filename|)q is appended
+                                    to the end of the command. This option can
+                                    be used multiple times
     --no-exec                       Remove any previously defined --exec
     --convert-subs FORMAT           Convert the subtitles to another format
                                     (currently supported: ass, lrc, srt, vtt)
@@ -1225,8 +1221,7 @@ ### Authentication with netrc

 The default location of the .netrc file is `~` (see below).

-As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter, it shall output the credentials in the netrc format and return `0` on success, other values will be treated as an error. `{}` in the command will be replaced by the name of the extractor to make it possible to select the credentials for the right extractor.
-To use braces in the command, they need to be escaped by doubling them. (see example below)
+As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter, it shall output the credentials in the netrc format and return `0` on success, other values will be treated as an error. `{}` in the command will be replaced by the name of the extractor to make it possible to select the credentials for the right extractor (To use literal braces, double them like `{{}}`).

 E.g. 
To use an encrypted `.netrc` file stored as `.authinfo.gpg` ``` @@ -1389,7 +1384,10 @@ # OUTPUT TEMPLATE - `subtitles_table` (table): The subtitle format table as printed by `--list-subs` - `automatic_captions_table` (table): The automatic subtitle format table as printed by `--list-subs` + Available only after the video is downloaded (`post_process`/`after_move`): + - `filepath`: Actual path of downloaded video file + Available only in `--sponsorblock-chapter-title`: - `start_time` (numeric): Start time of the chapter in seconds @@ -1435,7 +1433,7 @@ # Download YouTube playlist videos in separate directories according to their up $ yt-dlp -o "%(upload_date>%Y)s/%(title)s.%(ext)s" "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" # Prefix playlist index with " - " separator, but only if it is available -$ yt-dlp -o '%(playlist_index|)s%(playlist_index& - |)s%(title)s.%(ext)s' BaW_jenozKc "https://www.youtube.com/user/TheLinuxFoundation/playlists" +$ yt-dlp -o "%(playlist_index&{} - |)s%(title)s.%(ext)s" BaW_jenozKc "https://www.youtube.com/user/TheLinuxFoundation/playlists" # Download all playlists of YouTube channel/user keeping each playlist in separate directory: $ yt-dlp -o "%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s" "https://www.youtube.com/user/TheLinuxFoundation/playlists" diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e5c9d1aa21..73225bdb90 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -8,5 +8,32 @@ "action": "add", "when": "776d1c3f0c9b00399896dd2e40e78e9a43218109", "short": "[priority] **YouTube throttling fixes!**" + }, + { + "action": "remove", + "when": "2e023649ea4e11151545a34dc1360c114981a236" + }, + { + "action": "add", + "when": "01aba2519a0884ef17d5f85608dbd2a455577147", + "short": "[priority] YouTube: Improved throttling and signature fixes" + }, + { + "action": "change", + "when": "c86e433c35fe5da6cb29f3539eef97497f84ed38", + "short": "[extractor/niconico:series] Fix extraction (#6898)", + "authors": ["sqrtNOT"] + }, + { + "action": "change", + "when": "69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2", + "short": "[extractor/youtube:music_search_url] Extract title (#7102)", + "authors": ["kangalio"] + }, + { + "action": "change", + "when": "8417f26b8a819cd7ffcd4e000ca3e45033e670fb", + "short": "Add option `--color` (#6904)", + "authors": ["Grub4K"] } ] diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py index b8b7cbcf1d..2aa51eb6e9 100644 --- a/devscripts/cli_to_api.py +++ b/devscripts/cli_to_api.py @@ -19,11 +19,11 @@ def parse_patched_options(opts): 'extract_flat': False, 'concat_playlist': 'never', }) - yt_dlp.options.__dict__['create_parser'] = lambda: patched_parser + yt_dlp.options.create_parser = lambda: patched_parser try: return yt_dlp.parse_options(opts) finally: - yt_dlp.options.__dict__['create_parser'] = create_parser + yt_dlp.options.create_parser = create_parser default_opts = parse_patched_options([]).ydl_opts diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 1b7e251ee9..2fcdc06d77 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -44,7 +44,7 @@ def commit_lookup(cls): return { name: group for group, names in { - cls.PRIORITY: {''}, + cls.PRIORITY: {'priority'}, cls.CORE: { 'aes', 'cache', @@ -68,7 +68,7 @@ def commit_lookup(cls): 'misc', 'test', }, - cls.EXTRACTOR: {'extractor', 'extractors'}, + cls.EXTRACTOR: {'extractor'}, cls.DOWNLOADER: 
{'downloader'}, cls.POSTPROCESSOR: {'postprocessor'}, }.items() @@ -323,7 +323,7 @@ def apply_overrides(self, overrides): logger.debug(f'Ignored {when!r}, not in commits {self._start!r}') continue - override_hash = override.get('hash') + override_hash = override.get('hash') or when if override['action'] == 'add': commit = Commit(override.get('hash'), override['short'], override.get('authors') or []) logger.info(f'ADD {commit}') @@ -337,7 +337,7 @@ def apply_overrides(self, overrides): elif override['action'] == 'change': if override_hash not in self._commits: continue - commit = Commit(override_hash, override['short'], override['authors']) + commit = Commit(override_hash, override['short'], override.get('authors') or []) logger.info(f'CHANGE {self._commits[commit.hash]} -> {commit}') self._commits[commit.hash] = commit @@ -348,7 +348,7 @@ def groups(self): for commit in self: upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short) if upstream_re: - commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}' + commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}' match = self.MESSAGE_RE.fullmatch(commit.short) if not match: @@ -394,10 +394,10 @@ def details_from_prefix(prefix): return CommitGroup.CORE, None, () prefix, _, details = prefix.partition('/') - prefix = prefix.strip().lower() + prefix = prefix.strip() details = details.strip() - group = CommitGroup.get(prefix) + group = CommitGroup.get(prefix.lower()) if group is CommitGroup.PRIORITY: prefix, _, details = details.partition('/') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ccc9e36f34..05dd3ed412 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -668,7 +668,7 @@ def test(tmpl, expected, *, info=None, **params): for (name, got), expect in zip((('outtmpl', out), ('filename', fname)), expected): if callable(expect): self.assertTrue(expect(got), f'Wrong {name} from {tmpl}') - else: + elif expect is not None: self.assertEqual(got, expect, f'Wrong {name} from {tmpl}') # Side-effects @@ -759,15 +759,17 @@ def expect_same_infodict(out): test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none') FORMATS = self.outtmpl_info['formats'] - sanitize = lambda x: x.replace(':', '：').replace('"', '＂').replace('\n', ' ') # Custom type casting test('%(formats.:.id)l', 'id 1, id 2, id 3') test('%(formats.:.id)#l', ('id 1\nid 2\nid 3', 'id 1 id 2 id 3')) test('%(ext)l', 'mp4') test('%(formats.:.id) 18l', ' id 1, id 2, id 3') - test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS)))) - test('%(formats)#j', (json.dumps(FORMATS, indent=4), sanitize(json.dumps(FORMATS, indent=4)))) + test('%(formats)j', (json.dumps(FORMATS), None)) + test('%(formats)#j', ( + json.dumps(FORMATS, indent=4), + json.dumps(FORMATS, indent=4).replace(':', '：').replace('"', '＂').replace('\n', ' ') + )) test('%(title5).3B', 'á') test('%(title5)U', 'áéí 𝐀') test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') @@ -792,8 +794,8 @@ def expect_same_infodict(out): test('%(title|%)s %(title|%%)s', '% %%') test('%(id+1-height+3)05d', '00158') test('%(width+100)05d', 'NA') - test('%(formats.0) 15s', ('% 15s' % FORMATS[0], '% 15s' % sanitize(str(FORMATS[0])))) - test('%(formats.0)r', (repr(FORMATS[0]), sanitize(repr(FORMATS[0])))) + test('%(formats.0) 15s', ('% 15s' % FORMATS[0], None)) + test('%(formats.0)r', (repr(FORMATS[0]), None)) test('%(height.0)03d', '001') test('%(-height.0)04d', '-001') test('%(formats.-1.id)s', FORMATS[-1]['id']) @@ -805,7 +807,7 @@ def expect_same_infodict(out): out =
json.dumps([{'id': f['id'], 'height.:2': str(f['height'])[:2]} if 'height' in f else {'id': f['id']} for f in FORMATS]) - test('%(formats.:.{id,height.:2})j', (out, sanitize(out))) + test('%(formats.:.{id,height.:2})j', (out, None)) test('%(formats.:.{id,height}.id)l', ', '.join(f['id'] for f in FORMATS)) test('%(.{id,title})j', ('{"id": "1234"}', '{"id": "1234"}')) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b01477e6ff..e9682ddab0 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -12,28 +12,38 @@ from yt_dlp.jsinterp import JS_Undefined, JSInterpreter +class NaN: + pass + + class TestJSInterpreter(unittest.TestCase): - def _test(self, code, ret, func='f', args=()): - self.assertEqual(JSInterpreter(code).call_function(func, *args), ret) + def _test(self, jsi_or_code, expected, func='f', args=()): + if isinstance(jsi_or_code, str): + jsi_or_code = JSInterpreter(jsi_or_code) + got = jsi_or_code.call_function(func, *args) + if expected is NaN: + self.assertTrue(math.isnan(got), f'{got} is not NaN') + else: + self.assertEqual(got, expected) def test_basic(self): jsi = JSInterpreter('function f(){;}') self.assertEqual(repr(jsi.extract_function('f')), 'F<f>') - self.assertEqual(jsi.call_function('f'), None) + self._test(jsi, None) self._test('function f(){return 42;}', 42) self._test('function f(){42}', None) self._test('var f = function(){return 42;}', 42) - def test_calc(self): - self._test('function f(a){return 2*a+1;}', 7, args=[3]) - def test_div(self): jsi = JSInterpreter('function f(a, b){return a / b;}') - self.assertTrue(math.isnan(jsi.call_function('f', 0, 0))) - self.assertTrue(math.isnan(jsi.call_function('f', JS_Undefined, 1))) - self.assertTrue(math.isinf(jsi.call_function('f', 2, 0))) - self.assertEqual(jsi.call_function('f', 0, 3), 0) + self._test(jsi, NaN, args=(0, 0)) + self._test(jsi, NaN, args=(JS_Undefined, 1)) + self._test(jsi, float('inf'), args=(2, 0)) + self._test(jsi, 0, args=(0, 3)) + + def test_calc(self): + self._test('function f(a){return 2*a+1;}', 7, args=[3]) def test_empty_return(self): self._test('function f(){return; y()}', None) @@ -102,16 +112,15 @@ def test_precedence(self): ''', [20, 20, 30, 40, 50]) def test_builtins(self): - jsi = JSInterpreter('function f() { return NaN }') - self.assertTrue(math.isnan(jsi.call_function('f'))) + self._test('function f() { return NaN }', NaN) def test_date(self): self._test('function f() { return new Date("Wednesday 31 December 1969 18:01:26 MDT") - 0; }', 86000) jsi = JSInterpreter('function f(dt) { return new Date(dt) - 0; }') - self.assertEqual(jsi.call_function('f', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) - self.assertEqual(jsi.call_function('f', '12/31/1969 18:01:26 MDT'), 86000) # m/d/y - self.assertEqual(jsi.call_function('f', '1 January 1970 00:00:00 UTC'), 0) + self._test(jsi, 86000, args=['Wednesday 31 December 1969 18:01:26 MDT']) + self._test(jsi, 86000, args=['12/31/1969 18:01:26 MDT']) # m/d/y + self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC']) def test_call(self): jsi = JSInterpreter(''' @@ -119,8 +128,8 @@ def test_call(self): function y(a) { return x() + (a?a:0); } function z() { return y(3); } ''') - self.assertEqual(jsi.call_function('z'), 5) - self.assertEqual(jsi.call_function('y'), 2) + self._test(jsi, 5, func='z') + self._test(jsi, 2, func='y') def test_if(self): self._test(''' @@ -167,9 +176,9 @@ def test_switch(self): default:x=0; } return x } ''') - self.assertEqual(jsi.call_function('f', 1), 7) - self.assertEqual(jsi.call_function('f', 3), 
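# [Editorial sketch, not part of the patch] The NaN sentinel class introduced
# above exists because IEEE-754 NaN never compares equal to itself, so a plain
# assertEqual against float('nan') can never pass; math.isnan() must be used:
import math
nan = float('nan')
assert nan != nan        # NaN is unequal even to itself
assert math.isnan(nan)   # the check the refactored _test helper performs instead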
6) - self.assertEqual(jsi.call_function('f', 5), 0) + self._test(jsi, 7, args=[1]) + self._test(jsi, 6, args=[3]) + self._test(jsi, 0, args=[5]) def test_switch_default(self): jsi = JSInterpreter(''' @@ -182,9 +191,9 @@ def test_switch_default(self): case 1: x+=1; } return x } ''') - self.assertEqual(jsi.call_function('f', 1), 2) - self.assertEqual(jsi.call_function('f', 5), 11) - self.assertEqual(jsi.call_function('f', 9), 14) + self._test(jsi, 2, args=[1]) + self._test(jsi, 11, args=[5]) + self._test(jsi, 14, args=[9]) def test_try(self): self._test('function f() { try{return 10} catch(e){return 5} }', 10) @@ -312,12 +321,12 @@ def test_replace(self): def test_char_code_at(self): jsi = JSInterpreter('function f(i){return "test".charCodeAt(i)}') - self.assertEqual(jsi.call_function('f', 0), 116) - self.assertEqual(jsi.call_function('f', 1), 101) - self.assertEqual(jsi.call_function('f', 2), 115) - self.assertEqual(jsi.call_function('f', 3), 116) - self.assertEqual(jsi.call_function('f', 4), None) - self.assertEqual(jsi.call_function('f', 'not_a_number'), 116) + self._test(jsi, 116, args=[0]) + self._test(jsi, 101, args=[1]) + self._test(jsi, 115, args=[2]) + self._test(jsi, 116, args=[3]) + self._test(jsi, None, args=[4]) + self._test(jsi, 116, args=['not_a_number']) def test_bitwise_operators_overflow(self): self._test('function f(){return -524999584 << 5}', 379882496) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 6759d2c467..811f70e689 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -67,7 +67,7 @@ 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', - ) + ), ] _NSIG_TESTS = [ diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e51bceef34..7a5e593232 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -259,7 +259,7 @@ class YoutubeDL: consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - clean_infojson: Remove private fields from the infojson + clean_infojson: Remove internal metadata from the infojson getcomments: Extract video comments. 
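# [Editorial sketch, not part of the patch] The reworded `clean_infojson` entry
# above is the embedding-API counterpart of `--clean-info-json`. A minimal usage
# sketch, assuming yt-dlp is importable; the URL is a hypothetical placeholder:
from yt_dlp import YoutubeDL
opts = {'writeinfojson': True, 'clean_infojson': True}  # drop internal metadata such as filenames
with YoutubeDL(opts) as ydl:
    ydl.download(['https://example.com/some-video'])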
This will not be written to disk unless writeinfojson is also given writeannotations: Write the video annotations to a .annotations.xml file @@ -1902,7 +1902,7 @@ def __process_playlist(self, ie_result, download): continue entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') - if not lazy and 'playlist-index' in self.params.get('compat_opts', []): + if not lazy and 'playlist-index' in self.params['compat_opts']: playlist_index = ie_result['requested_entries'][i] entry_copy = collections.ChainMap(entry, { @@ -2959,8 +2959,7 @@ def print_field(field, actual_field=None, optional=False): print_field('url', 'urls') print_field('thumbnail', optional=True) print_field('description', optional=True) - if filename: - print_field('filename') + print_field('filename') if self.params.get('forceduration') and info_copy.get('duration') is not None: self.to_stdout(formatSeconds(info_copy['duration'])) print_field('format') @@ -3185,7 +3184,6 @@ def existing_video_file(*filepaths): return if info_dict.get('requested_formats') is not None: - requested_formats = info_dict['requested_formats'] old_ext = info_dict['ext'] if self.params.get('merge_output_format') is None: if (info_dict['ext'] == 'webm' @@ -3212,6 +3210,7 @@ def correct_ext(filename, ext=new_ext): full_filename = correct_ext(full_filename) temp_filename = correct_ext(temp_filename) dl_filename = existing_video_file(full_filename, temp_filename) + info_dict['__real_download'] = False merger = FFmpegMergerPP(self) @@ -3219,12 +3218,12 @@ def correct_ext(filename, ext=new_ext): if dl_filename is not None: self.report_file_already_downloaded(dl_filename) elif fd: - for f in requested_formats if fd != FFmpegFD else []: + for f in info_dict['requested_formats'] if fd != FFmpegFD else []: f['filepath'] = fname = prepend_extension( correct_ext(temp_filename, info_dict['ext']), 'f%s' % f['format_id'], info_dict['ext']) downloaded.append(fname) - info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) + info_dict['url'] = '\n'.join(f['url'] for f in info_dict['requested_formats']) success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download else: @@ -3248,7 +3247,7 @@ def correct_ext(filename, ext=new_ext): f'You have requested downloading multiple formats to stdout {reason}. 
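# [Editorial sketch, not part of the patch] The f['filepath'] = fname assignment
# above writes into the per-format dicts; patch 208 later in this series guards
# against that by shallow-copying them first via list(map(dict, ...)):
formats = [{'format_id': '137'}, {'format_id': '140'}]
copies = list(map(dict, formats))           # one independent dict per format
copies[0]['filepath'] = '/tmp/f137.mp4'     # hypothetical path
assert 'filepath' not in formats[0]         # the original dict stays untouched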
' 'The formats will be streamed one after the other') fname = temp_filename - for f in requested_formats: + for f in info_dict['requested_formats']: new_info = dict(info_dict) del new_info['requested_formats'] new_info.update(f) @@ -4109,8 +4108,11 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: + if isinstance(err, urllib.error.HTTPError) and err.code == 404: + self.to_screen(f'[info] {thumb_display_id.title()} does not exist') + else: + self.report_warning(f'Unable to download {thumb_display_id}: {err}') thumbnails.pop(idx) - self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break return ret diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 8693e0b4ad..f21e4f7e7b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1326,3 +1326,7 @@ def get_cookie_header(self, url): cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) self.add_cookie_header(cookie_req) return cookie_req.get_header('Cookie') + + def clear(self, *args, **kwargs): + with contextlib.suppress(KeyError): + return super().clear(*args, **kwargs) diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 477ec3c8a0..a0219a3509 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -49,7 +49,6 @@ class FileDownloader: verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. - continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for expected network errors. Default is 0 for API, but 10 for CLI diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index cfe7397845..7d8575c2a4 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -7,9 +7,9 @@ from .external import FFmpegFD from ..utils import ( DownloadError, - str_or_none, - sanitized_Request, WebSocketsWrapper, + sanitized_Request, + str_or_none, try_get, ) diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py index 0fcf022820..40430505d6 100644 --- a/yt_dlp/extractor/ciscowebex.py +++ b/yt_dlp/extractor/ciscowebex.py @@ -49,7 +49,7 @@ def _real_extract(self, url): 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) - if urlh.status == 403: + if urlh.getcode() == 403: if stream['code'] == 53004: self.raise_login_required() if stream['code'] == 53005: @@ -59,7 +59,7 @@ def _real_extract(self, url): 'This video is protected by a password, use the --video-password option', expected=True) raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True) - if urlh.status == 429: + if urlh.getcode() == 429: self.raise_login_required( f'{self.IE_NAME} asks you to solve a CAPTCHA. 
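# [Editorial sketch, not part of the patch] The new cookies.py clear() override
# above leans on contextlib.suppress, which silences exactly the listed
# exception and re-raises anything else. A standalone demonstration:
import contextlib
jar = {}
with contextlib.suppress(KeyError):
    del jar['missing.example.com']   # would raise KeyError; it is swallowed here
print('execution continues')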
Solve CAPTCHA in browser and', method='cookies') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f11a673583..9662a7ee1c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -17,6 +17,7 @@ import sys import time import types +import urllib.error import urllib.parse import urllib.request import xml.etree.ElementTree @@ -58,6 +59,7 @@ join_nonempty, js_to_json, mimetype2ext, + netrc_from_content, network_exceptions, orderedSet, parse_bitrate, @@ -72,7 +74,6 @@ smuggle_url, str_or_none, str_to_int, - netrc_from_content, strip_or_none, traverse_obj, truncate_string, diff --git a/yt_dlp/extractor/dumpert.py b/yt_dlp/extractor/dumpert.py old mode 100755 new mode 100644 diff --git a/yt_dlp/extractor/globalplayer.py b/yt_dlp/extractor/globalplayer.py old mode 100755 new mode 100644 diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 0d0ad0bb86..e63714e846 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -238,10 +238,8 @@ class OdnoklassnikiIE(InfoExtractor): def _clear_cookies(self, cdn_url): # Direct http downloads will fail if CDN cookies are set # so we need to reset them after each format extraction - if self._get_cookies('https://notarealsubdomain.mycdn.me/'): - self.cookiejar.clear(domain='.mycdn.me') - if self._get_cookies(cdn_url): - self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + self.cookiejar.clear(domain='.mycdn.me') + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) @classmethod def _extract_embed_urls(cls, url, webpage): diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index 2aa0dd870a..c686044fa2 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -488,9 +488,9 @@ def _call_api(self, resource, video_id, query={}, **kwargs): f'{self._API_BASE_URL}/{resource}', video_id, query={'lang': 'pl', 'platform': 'BROWSER', **query}, expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs) - if is_valid(urlh.status): + if is_valid(urlh.getcode()): return document - raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})') + raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.getcode()})') def _parse_video(self, video, with_url=True): info_dict = traverse_obj(video, { diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py index 770aa284da..23e1aaf202 100644 --- a/yt_dlp/extractor/vidio.py +++ b/yt_dlp/extractor/vidio.py @@ -39,7 +39,7 @@ def is_logged_in(): login_post, login_post_urlh = self._download_webpage_handle( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401]) - if login_post_urlh.status == 401: + if login_post_urlh.getcode() == 401: if get_element_by_class('onboarding-content-register-popup__title', login_post): raise ExtractorError( 'Unable to log in: The provided email has not registered yet.', expected=True) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4daa4f50e9..11e47904a5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -811,7 +811,7 @@ def _extract_badges(self, badge_list: list): 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, - 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, } label_map = { @@ -821,7 +821,7 @@ def 
_extract_badges(self, badge_list: list): 'live': BadgeType.LIVE_NOW, 'premium': BadgeType.AVAILABILITY_PREMIUM, 'verified': BadgeType.VERIFIED, - 'official artist channel': BadgeType.VERIFIED + 'official artist channel': BadgeType.VERIFIED, } badges = [] @@ -3935,7 +3935,7 @@ def process_manifest_format(f, proto, client_name, itag): f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) - if self.get_param('verbose'): + if self.get_param('verbose') or all_formats: f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] @@ -4531,7 +4531,7 @@ def process_language(container, base_url, lang_code, sub_name, query): and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) ): upload_date = strftime_or_none( - self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date + self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date info['upload_date'] = upload_date for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: @@ -5071,7 +5071,7 @@ def _get_uncropped(url): last_updated_unix = self._parse_time_text( self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) - info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d') + info['modified_date'] = strftime_or_none(last_updated_unix) info['view_count'] = self._get_count(playlist_stats, 1) if info['view_count'] is None: # 0 is allowed diff --git a/yt_dlp/options.py b/yt_dlp/options.py index b174a24af7..9d6dbec9fc 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1414,8 +1414,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--clean-info-json', '--clean-infojson', action='store_true', dest='clean_infojson', default=None, help=( - 'Remove some private fields such as filenames from the infojson. ' - 'Note that it could still contain some personal information (default)')) + 'Remove some internal metadata such as filenames from the infojson (default)')) filesystem.add_option( '--no-clean-info-json', '--no-clean-infojson', action='store_false', dest='clean_infojson', @@ -1678,8 +1677,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Execute a command, optionally prefixed with when to execute it, separated by a ":". ' 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). ' 'Same syntax as the output template can be used to pass any field as arguments to the command. ' - 'After download, an additional field "filepath" that contains the final path of the downloaded file ' - 'is also available, and if no fields are passed, %(filepath,_filename|)q is appended to the end of the command. ' + 'If no fields are passed, %(filepath,_filename|)q is appended to the end of the command. 
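# [Editorial note] Hypothetical command lines illustrating the trimmed --exec
# help text above; they are examples, not taken from the patch:
#   yt-dlp --exec 'echo %(filepath)q' URL          # runs at after_move by default
#   yt-dlp --exec 'before_dl:echo %(title)q' URL   # a "WHEN" prefix picks the stage
# When no fields are passed, %(filepath,_filename|)q is appended, so a bare
# `--exec echo` prints the final file path.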
' 'This option can be used multiple times')) postproc.add_option( '--no-exec', diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 1097778f0f..96ac468b1f 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -6,7 +6,7 @@ import urllib.parse import zlib -from ._utils import decode_base_n, preferredencoding +from ._utils import Popen, decode_base_n, preferredencoding from .traversal import traverse_obj from ..dependencies import certifi, websockets @@ -174,3 +174,7 @@ def handle_youtubedl_headers(headers): del filtered_headers['Youtubedl-no-compression'] return filtered_headers + + +def process_communicate_or_kill(p, *args, **kwargs): + return Popen.communicate_or_kill(p, *args, **kwargs) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 28c2785cb0..bc1bc9116c 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -872,12 +872,6 @@ def __init__(self, content): self._parse('-', stream, False) -def process_communicate_or_kill(p, *args, **kwargs): - deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed ' - f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead') - return Popen.communicate_or_kill(p, *args, **kwargs) - - class Popen(subprocess.Popen): if sys.platform == 'win32': _startupinfo = subprocess.STARTUPINFO() @@ -1662,7 +1656,7 @@ def unified_strdate(date_str, day_first=True): def unified_timestamp(date_str, day_first=True): - if date_str is None: + if not isinstance(date_str, str): return None date_str = re.sub(r'\s+', ' ', re.sub( @@ -2454,7 +2448,7 @@ def request_to_url(req): return req -def strftime_or_none(timestamp, date_format, default=None): +def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): datetime_object = None try: if isinstance(timestamp, (int, float)): # unix timestamp From 84078a8b38f403495d00b46654c8750774d821de Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 05:45:09 +0530 Subject: [PATCH 208/501] [core] Fix `filepath` being copied to underlying format dict Closes #6536 --- yt_dlp/YoutubeDL.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7a5e593232..503aafbc77 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3212,6 +3212,8 @@ def correct_ext(filename, ext=new_ext): dl_filename = existing_video_file(full_filename, temp_filename) info_dict['__real_download'] = False + # NOTE: Copy so that original format dicts are not modified + info_dict['requested_formats'] = list(map(dict, info_dict['requested_formats'])) merger = FFmpegMergerPP(self) downloaded = [] From 1619ab3e67d8dc4f86fc7ed292c79345bc0d91a0 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 06:49:56 +0530 Subject: [PATCH 209/501] Bugfix for ebe1b4e34f43c3acad30e4bcb8484681a030c114 --- test/test_YoutubeDL.py | 2 ++ yt_dlp/YoutubeDL.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 05dd3ed412..f495fa6d90 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -630,6 +630,7 @@ def test_add_extra_info(self): self.assertEqual(test_dict['playlist'], 'funny videos') outtmpl_info = { + 'id': '1234', 'id': '1234', 'ext': 'mp4', 'width': None, @@ -754,6 +755,7 @@ def expect_same_infodict(out): test('%(ext)c', 'm') test('%(id)d %(id)r', "1234 '1234'") test('%(id)r %(height)r', "'1234' 1080") + test('%(title5)a %(height)a', 
(R"'\xe1\xe9\xed \U0001d400' 1080", None)) test('%(ext)s-%(ext|def)d', 'mp4-def') test('%(width|0)04d', '0') test('a%(width|b)d', 'ab', outtmpl_na_placeholder='none') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 503aafbc77..bc5c1b95ee 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1328,17 +1328,19 @@ def create_key(outer_mobj): value = str(value)[0] else: fmt = str_fmt - elif fmt[-1] not in 'rs': # numeric + elif fmt[-1] not in 'rsa': # numeric value = float_or_none(value) if value is None: value, fmt = default, 's' if sanitize: + # If value is an object, sanitize might convert it to a string + # So we convert it to repr first if fmt[-1] == 'r': - # If value is an object, sanitize might convert it to a string - # So we convert it to repr first value, fmt = repr(value), str_fmt - if fmt[-1] in 'csr': + elif fmt[-1] == 'a': + value, fmt = ascii(value), str_fmt + if fmt[-1] in 'csra': value = sanitizer(initial_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) From 42f2d40b475db66486a4b4fe5b56751a640db5db Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 08:51:14 +0530 Subject: [PATCH 210/501] Update to ytdl-commit-07af47 [YouTube] Improve fix for ae8ba2c https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e --- test/test_jsinterp.py | 26 ++++++++++++++++++++++++++ yt_dlp/casefold.py | 5 +++++ yt_dlp/jsinterp.py | 2 +- 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/casefold.py diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index e9682ddab0..86928a6a02 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -35,6 +35,21 @@ def test_basic(self): self._test('function f(){42}', None) self._test('var f = function(){return 42;}', 42) + def test_add(self): + self._test('function f(){return 42 + 7;}', 49) + self._test('function f(){return 42 + undefined;}', NaN) + self._test('function f(){return 42 + null;}', 42) + + def test_sub(self): + self._test('function f(){return 42 - 7;}', 35) + self._test('function f(){return 42 - undefined;}', NaN) + self._test('function f(){return 42 - null;}', 42) + + def test_mul(self): + self._test('function f(){return 42 * 7;}', 294) + self._test('function f(){return 42 * undefined;}', NaN) + self._test('function f(){return 42 * null;}', 0) + def test_div(self): jsi = JSInterpreter('function f(a, b){return a / b;}') self._test(jsi, NaN, args=(0, 0)) @@ -42,6 +57,17 @@ def test_div(self): self._test(jsi, float('inf'), args=(2, 0)) self._test(jsi, 0, args=(0, 3)) + def test_mod(self): + self._test('function f(){return 42 % 7;}', 0) + self._test('function f(){return 42 % 0;}', NaN) + self._test('function f(){return 42 % undefined;}', NaN) + + def test_exp(self): + self._test('function f(){return 42 ** 2;}', 1764) + self._test('function f(){return 42 ** undefined;}', NaN) + self._test('function f(){return 42 ** null;}', 1) + self._test('function f(){return undefined ** 42;}', NaN) + def test_calc(self): self._test('function f(a){return 2*a+1;}', 7, args=[3]) diff --git a/yt_dlp/casefold.py b/yt_dlp/casefold.py new file mode 100644 index 0000000000..41a53e5b65 --- /dev/null +++ b/yt_dlp/casefold.py @@ -0,0 +1,5 @@ +import warnings + +warnings.warn(DeprecationWarning(f'{__name__} is deprecated')) + +casefold = str.casefold diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 9c280fb86f..bda3fb4599 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -812,9 +812,9 @@ def 
extract_function_code(self, funcname): \((?P<args>[^)]*)\)\s* (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code')) if func_m is None: raise self.Exception(f'Could not find JS function "{funcname}"') + code, _ = self._separate_at_paren(func_m.group('code')) return [x.strip() for x in func_m.group('args').split(',')], code def extract_function(self, funcname): From d1b21561497b6bbb8ff1202e63f48eb41bd315af Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Wed, 21 Jun 2023 04:02:40 +0000 Subject: [PATCH 211/501] Release 2023.06.21 Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/6_question.yml | 8 +- CONTRIBUTORS | 46 +++ Changelog.md | 280 ++++++++++++++++++ README.md | 2 +- supportedsites.md | 111 +++++-- yt_dlp/version.py | 4 +- 11 files changed, 439 insertions(+), 52 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 77b777d5a9..351454b127 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 890df48fac..b2a613e2f9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.21** 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index ef9bda36a8..c100561eb5 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 122dda4f26..e97d7b5073 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running 
yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index b17c656587..a44612d795 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index c694e5a5a1..a15a469680 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.03.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.03.04 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.03.04, Current version: 2023.03.04 - yt-dlp is up to date (2023.03.04) + Latest version: 2023.06.21, Current version: 2023.06.21 + yt-dlp is up to date (2023.06.21) <more lines> render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index d6ba617b75..3b35895d93 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -409,3 +409,49 @@ Hill-98 LXYan2333 mushbite venkata-krishnas +7vlad7 +alexklapheke +arobase-che +bepvte +bergoid +blmarket +brandon-dacrib +c-basalt +CoryTibbettsDev +Cyberes +D0LLYNH0 +danog +DataGhost +falbrechtskirchinger +foreignBlade +garret1317 +hasezoey +hoaluvn +ItzMaxTV +ivanskodje +jo-nike +kangalio +linsui +makew0rld +menschel +mikf +mrscrapy +NDagestad +Neurognostic +NextFire +nick-cd +permunkle +pzhlkj6612 +ringus1 +rjy +Schmoaaaaah +sjthespian +theperfectpunk +toomyzoom +truedread +TxI5 +unbeatable-101 +vampirefrog +vidiot720 +viktor-enzell +zhgwn diff --git a/Changelog.md b/Changelog.md index 186998edee..d7a1cb4953 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,286 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.06.21 + +#### Important changes +- YouTube: Improved throttling and signature fixes + +#### Core changes +- [Add `--compat-option playlist-match-filter`](https://github.com/yt-dlp/yt-dlp/commit/93b39cdbd9dcf351bfa0c4ee252805b4617fdca9) by [pukkandan](https://github.com/pukkandan) +- [Add `--no-quiet`](https://github.com/yt-dlp/yt-dlp/commit/d669772c65e8630162fd6555d0a578b246591921) by [pukkandan](https://github.com/pukkandan) +- [Add option `--color`](https://github.com/yt-dlp/yt-dlp/commit/8417f26b8a819cd7ffcd4e000ca3e45033e670fb) ([#6904](https://github.com/yt-dlp/yt-dlp/issues/6904)) by [Grub4K](https://github.com/Grub4K) +- [Add option `--netrc-cmd`](https://github.com/yt-dlp/yt-dlp/commit/db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb) ([#6682](https://github.com/yt-dlp/yt-dlp/issues/6682)) by [NDagestad](https://github.com/NDagestad), [pukkandan](https://github.com/pukkandan) +- [Add option `--xff`](https://github.com/yt-dlp/yt-dlp/commit/c16644642b08e2bf4130a6c5fa01395d8718c990) by [pukkandan](https://github.com/pukkandan) +- [Auto-select default format in `-f-`](https://github.com/yt-dlp/yt-dlp/commit/372a0f3b9dadd1e52234b498aa4c7040ef868c7d) ([#7101](https://github.com/yt-dlp/yt-dlp/issues/7101)) by [ivanskodje](https://github.com/ivanskodje), [pukkandan](https://github.com/pukkandan) +- [Deprecate internal `Youtubedl-no-compression` header](https://github.com/yt-dlp/yt-dlp/commit/955c89584b66fcd0fcfab3e611f1edeb1ca63886) ([#6876](https://github.com/yt-dlp/yt-dlp/issues/6876)) by [coletdjnz](https://github.com/coletdjnz) +- [Do not translate newlines in 
`--print-to-file`](https://github.com/yt-dlp/yt-dlp/commit/9874e82b5a61582169300bea561b3e8899ad1ef7) by [pukkandan](https://github.com/pukkandan) +- [Ensure pre-processor errors do not block `--print`](https://github.com/yt-dlp/yt-dlp/commit/f005a35aa7e4f67a0c603a946c0dd714c151b2d6) by [pukkandan](https://github.com/pukkandan) (With fixes in [17ba434](https://github.com/yt-dlp/yt-dlp/commit/17ba4343cf99701692a7f4798fd42b50f644faba)) +- [Fix `filepath` being copied to underlying format dict](https://github.com/yt-dlp/yt-dlp/commit/84078a8b38f403495d00b46654c8750774d821de) by [pukkandan](https://github.com/pukkandan) +- [Improve HTTP redirect handling](https://github.com/yt-dlp/yt-dlp/commit/08916a49c777cb6e000eec092881eb93ec22076c) ([#7094](https://github.com/yt-dlp/yt-dlp/issues/7094)) by [coletdjnz](https://github.com/coletdjnz) +- [Populate `filename` and `urls` fields at all stages of `--print`](https://github.com/yt-dlp/yt-dlp/commit/170605840ea9d5ad75da6576485ea7d125b428ee) by [pukkandan](https://github.com/pukkandan) (With fixes in [b5f61b6](https://github.com/yt-dlp/yt-dlp/commit/b5f61b69d4561b81fc98c226b176f0c15493e688)) +- [Relaxed validation for numeric format filters](https://github.com/yt-dlp/yt-dlp/commit/c3f624ef0a5d7a6ae1c5ffeb243087e9fc7d79dc) by [pukkandan](https://github.com/pukkandan) +- [Support decoding multiple content encodings](https://github.com/yt-dlp/yt-dlp/commit/daafbf49b3482edae4d70dd37070be99742a926e) ([#7142](https://github.com/yt-dlp/yt-dlp/issues/7142)) by [coletdjnz](https://github.com/coletdjnz) +- [Support loading info.json with a list at it's root](https://github.com/yt-dlp/yt-dlp/commit/ab1de9cb1e39cf421c2b7dc6756c6ff1955bb313) by [pukkandan](https://github.com/pukkandan) +- [Workaround erroneous urllib Windows proxy parsing](https://github.com/yt-dlp/yt-dlp/commit/3f66b6fe50f8d5b545712f8b19d5ae62f5373980) ([#7092](https://github.com/yt-dlp/yt-dlp/issues/7092)) by [coletdjnz](https://github.com/coletdjnz) +- **cookies** + - [Defer extraction of v11 key from keyring](https://github.com/yt-dlp/yt-dlp/commit/9b7a48abd1b187eae1e3f6c9839c47d43ccec00b) by [Grub4K](https://github.com/Grub4K) + - [Move `YoutubeDLCookieJar` to cookies module](https://github.com/yt-dlp/yt-dlp/commit/b87e01c123fd560b6a674ce00f45a9459d82d98a) ([#7091](https://github.com/yt-dlp/yt-dlp/issues/7091)) by [coletdjnz](https://github.com/coletdjnz) + - [Support custom Safari cookies path](https://github.com/yt-dlp/yt-dlp/commit/a58182b75a05fe0a10c5e94a536711d3ade19c20) ([#6783](https://github.com/yt-dlp/yt-dlp/issues/6783)) by [NextFire](https://github.com/NextFire) + - [Update for chromium changes](https://github.com/yt-dlp/yt-dlp/commit/b38d4c941d1993ab27e4c0f8e024e23c2ec0f8f8) ([#6897](https://github.com/yt-dlp/yt-dlp/issues/6897)) by [mbway](https://github.com/mbway) +- **Cryptodome**: [Fix `__bool__`](https://github.com/yt-dlp/yt-dlp/commit/98ac902c4979e4529b166e873473bef42baa2e3e) by [pukkandan](https://github.com/pukkandan) +- **jsinterp** + - [Do not compile regex](https://github.com/yt-dlp/yt-dlp/commit/7aeda6cc9e73ada0b0a0b6a6748c66bef63a20a8) by [pukkandan](https://github.com/pukkandan) + - [Fix division](https://github.com/yt-dlp/yt-dlp/commit/b4a252fba81f53631c07ca40ce7583f5d19a8a36) ([#7279](https://github.com/yt-dlp/yt-dlp/issues/7279)) by [bashonly](https://github.com/bashonly) + - [Fix global object extraction](https://github.com/yt-dlp/yt-dlp/commit/01aba2519a0884ef17d5f85608dbd2a455577147) by [pukkandan](https://github.com/pukkandan) + - [Handle `NaN` in bitwise 
operators](https://github.com/yt-dlp/yt-dlp/commit/1d7656184c6b8aa46b29149893894b3c24f1df00) by [pukkandan](https://github.com/pukkandan) + - [Handle negative numbers better](https://github.com/yt-dlp/yt-dlp/commit/7cf51f21916292cd80bdeceb37489f5322f166dd) by [pukkandan](https://github.com/pukkandan) +- **outtmpl** + - [Allow `\n` in replacements and default.](https://github.com/yt-dlp/yt-dlp/commit/78fde6e3398ff11e5d383a66b28664badeab5180) by [pukkandan](https://github.com/pukkandan) + - [Fix some minor bugs](https://github.com/yt-dlp/yt-dlp/commit/ebe1b4e34f43c3acad30e4bcb8484681a030c114) by [pukkandan](https://github.com/pukkandan) (With fixes in [1619ab3](https://github.com/yt-dlp/yt-dlp/commit/1619ab3e67d8dc4f86fc7ed292c79345bc0d91a0)) + - [Support `str.format` syntax inside replacements](https://github.com/yt-dlp/yt-dlp/commit/ec9311c41b111110bc52cfbd6ea682c6fb23f77a) by [pukkandan](https://github.com/pukkandan) +- **update** + - [Better error handling](https://github.com/yt-dlp/yt-dlp/commit/d2e84d5eb01c66fc5304e8566348d65a7be24ed7) by [pukkandan](https://github.com/pukkandan) + - [Do not restart into versions without `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/02948a17d903f544363bb20b51a6d8baed7bba08) by [pukkandan](https://github.com/pukkandan) + - [Implement `--update-to` repo](https://github.com/yt-dlp/yt-dlp/commit/665472a7de3880578c0b7b3f95c71570c056368e) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- **upstream** + - [Merged with youtube-dl 07af47](https://github.com/yt-dlp/yt-dlp/commit/42f2d40b475db66486a4b4fe5b56751a640db5db) by [pukkandan](https://github.com/pukkandan) + - [Merged with youtube-dl d1c6c5](https://github.com/yt-dlp/yt-dlp/commit/4823ec9f461512daa1b8ab362893bb86a6320b26) by [pukkandan](https://github.com/pukkandan) (With fixes in [edbe5b5](https://github.com/yt-dlp/yt-dlp/commit/edbe5b589dd0860a67b4e03f58db3cd2539d91c2) by [bashonly](https://github.com/bashonly)) +- **utils** + - `FormatSorter`: [Improve `size` and `br`](https://github.com/yt-dlp/yt-dlp/commit/eedda5252c05327748dede204a8fccafa0288118) by [pukkandan](https://github.com/pukkandan), [u-spec-png](https://github.com/u-spec-png) + - `js_to_json`: [Implement template strings](https://github.com/yt-dlp/yt-dlp/commit/0898c5c8ccadfc404472456a7a7751b72afebadd) ([#6623](https://github.com/yt-dlp/yt-dlp/issues/6623)) by [Grub4K](https://github.com/Grub4K) + - `locked_file`: [Fix for virtiofs](https://github.com/yt-dlp/yt-dlp/commit/45998b3e371b819ce0dbe50da703809a048cc2fe) ([#6840](https://github.com/yt-dlp/yt-dlp/issues/6840)) by [brandon-dacrib](https://github.com/brandon-dacrib) + - `strftime_or_none`: [Handle negative timestamps](https://github.com/yt-dlp/yt-dlp/commit/a35af4306d24c56c6358f89cdf204860d1cd62b4) by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan) + - `traverse_obj` + - [Allow iterables in traversal](https://github.com/yt-dlp/yt-dlp/commit/21b5ec86c2c37d10c5bb97edd7051d3aac16bb3e) ([#6902](https://github.com/yt-dlp/yt-dlp/issues/6902)) by [Grub4K](https://github.com/Grub4K) + - [More fixes](https://github.com/yt-dlp/yt-dlp/commit/b079c26f0af8085bccdadc72c61c8164ca5ab0f8) ([#6959](https://github.com/yt-dlp/yt-dlp/issues/6959)) by [Grub4K](https://github.com/Grub4K) + - `write_string`: [Fix noconsole behavior](https://github.com/yt-dlp/yt-dlp/commit/3b479100df02e20dd949e046003ae96ddbfced57) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Do not exit early for unsuitable 
`url_result`](https://github.com/yt-dlp/yt-dlp/commit/baa922b5c74b10e3b86ff5e6cf6529b3aae8efab) by [pukkandan](https://github.com/pukkandan) +- [Do not warn for invalid chapter data in description](https://github.com/yt-dlp/yt-dlp/commit/84ffeb7d5e72e3829319ba7720a8480fc4c7503b) by [pukkandan](https://github.com/pukkandan) +- [Extract more metadata from ISM](https://github.com/yt-dlp/yt-dlp/commit/f68434cc74cfd3db01b266476a2eac8329fbb267) by [pukkandan](https://github.com/pukkandan) +- **abematv**: [Add fallback for title and description extraction and extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/c449c0655d7c8549e6e1389c26b628053b253d39) ([#6994](https://github.com/yt-dlp/yt-dlp/issues/6994)) by [Lesmiscore](https://github.com/Lesmiscore) +- **acast**: [Support embeds](https://github.com/yt-dlp/yt-dlp/commit/c91ac833ea99b00506e470a44cf930e4e23378c9) ([#7212](https://github.com/yt-dlp/yt-dlp/issues/7212)) by [pabs3](https://github.com/pabs3) +- **adobepass**: [Handle `Charter_Direct` MSO as `Spectrum`](https://github.com/yt-dlp/yt-dlp/commit/ea0570820336a0fe9c3b530d1b0d1e59313274f4) ([#6824](https://github.com/yt-dlp/yt-dlp/issues/6824)) by [bashonly](https://github.com/bashonly) +- **aeonco**: [Support Youtube embeds](https://github.com/yt-dlp/yt-dlp/commit/ed81b74802b4247ee8d9dc0ef87eb52baefede1c) ([#6591](https://github.com/yt-dlp/yt-dlp/issues/6591)) by [alexklapheke](https://github.com/alexklapheke) +- **afreecatv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fdd69db38924c38194ef236b26325d66ac815c88) ([#6283](https://github.com/yt-dlp/yt-dlp/issues/6283)) by [blmarket](https://github.com/blmarket) +- **ARDBetaMediathek**: [Add thumbnail](https://github.com/yt-dlp/yt-dlp/commit/f78eb41e1c0f1dcdb10317358a26bf541dc7ee15) ([#6890](https://github.com/yt-dlp/yt-dlp/issues/6890)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **bibeltv**: [Fix extraction, support live streams and series](https://github.com/yt-dlp/yt-dlp/commit/4ad58667c102bd82a7c4cca8aa395ec1682e3b4c) ([#6505](https://github.com/yt-dlp/yt-dlp/issues/6505)) by [flashdagger](https://github.com/flashdagger) +- **bilibili** + - [Support festival videos](https://github.com/yt-dlp/yt-dlp/commit/ab29e47029e2f5b48abbbab78e82faf7cf6e9506) ([#6547](https://github.com/yt-dlp/yt-dlp/issues/6547)) by [qbnu](https://github.com/qbnu) + - SpaceVideo: [Extract signature](https://github.com/yt-dlp/yt-dlp/commit/6f10cdcf7eeaeae5b75e0a4428cd649c156a2d83) ([#7149](https://github.com/yt-dlp/yt-dlp/issues/7149)) by [elyse0](https://github.com/elyse0) +- **biliIntl**: [Add comment extraction](https://github.com/yt-dlp/yt-dlp/commit/b093c38cc9f26b59a8504211d792f053142c847d) ([#6079](https://github.com/yt-dlp/yt-dlp/issues/6079)) by [HobbyistDev](https://github.com/HobbyistDev) +- **bitchute**: [Add more fallback subdomains](https://github.com/yt-dlp/yt-dlp/commit/0c4e0fbcade0fc92d14c2a6d63e360fe067f6192) ([#6907](https://github.com/yt-dlp/yt-dlp/issues/6907)) by [Neurognostic](https://github.com/Neurognostic) +- **booyah**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f7f7a877bf8e87fd4eb0ad2494ad948ca7691114) by [pukkandan](https://github.com/pukkandan) +- **BrainPOP**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/979568f26ece80bca72b48f0dd57d676e431059a) ([#6106](https://github.com/yt-dlp/yt-dlp/issues/6106)) by [MinePlayersPE](https://github.com/MinePlayersPE) +- **bravotv** + - [Detect DRM](https://github.com/yt-dlp/yt-dlp/commit/1fe5bf240e6ade487d18079a62aa36bcc440a27a) 
([#7171](https://github.com/yt-dlp/yt-dlp/issues/7171)) by [bashonly](https://github.com/bashonly) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/06966cb8966b9aa4f60ab9c44c182a057d4ca3a3) ([#6568](https://github.com/yt-dlp/yt-dlp/issues/6568)) by [bashonly](https://github.com/bashonly) +- **camfm**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/4cbfa570a1b9bd65b0f48770693377e8d842dcb0) ([#7083](https://github.com/yt-dlp/yt-dlp/issues/7083)) by [garret1317](https://github.com/garret1317) +- **cbc** + - [Fix live extractor, playlist `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/7a7b1376fbce0067cf37566bb47131bc0022638d) ([#6625](https://github.com/yt-dlp/yt-dlp/issues/6625)) by [makew0rld](https://github.com/makew0rld) + - [Ignore 426 from API](https://github.com/yt-dlp/yt-dlp/commit/4afb208cf07b59291ae3b0c4efc83945ee5b8812) ([#6781](https://github.com/yt-dlp/yt-dlp/issues/6781)) by [jo-nike](https://github.com/jo-nike) + - gem: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/871c907454693940cb56906ed9ea49fcb7154829) ([#6499](https://github.com/yt-dlp/yt-dlp/issues/6499)) by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +- **cbs**: [Add `ParamountPressExpress` extractor](https://github.com/yt-dlp/yt-dlp/commit/44369c9afa996e14e9f466754481d878811b5b4a) ([#6604](https://github.com/yt-dlp/yt-dlp/issues/6604)) by [bashonly](https://github.com/bashonly) +- **cbsnews**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/f6e43d6fa9804c24525e1fed0a87782754dab7ed) ([#6681](https://github.com/yt-dlp/yt-dlp/issues/6681)) by [bashonly](https://github.com/bashonly) +- **chilloutzone**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6f4fc5660f40f3458882a8f51601eae4af7be609) ([#6445](https://github.com/yt-dlp/yt-dlp/issues/6445)) by [bashonly](https://github.com/bashonly) +- **clipchamp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2f07c4c1da4361af213e5791279b9d152d2e4ce3) ([#6978](https://github.com/yt-dlp/yt-dlp/issues/6978)) by [bashonly](https://github.com/bashonly) +- **comedycentral**: [Add support for movies](https://github.com/yt-dlp/yt-dlp/commit/66468bbf49562ff82670cbbd456c5e8448a6df34) ([#7108](https://github.com/yt-dlp/yt-dlp/issues/7108)) by [sqrtNOT](https://github.com/sqrtNOT) +- **crtvg**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/26c517b29c8727e47948d6fff749d5297f0efb60) ([#7168](https://github.com/yt-dlp/yt-dlp/issues/7168)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **crunchyroll**: [Rework with support for movies, music and artists](https://github.com/yt-dlp/yt-dlp/commit/032de83ea9ff2f4977d9c71a93bbc1775597b762) ([#6237](https://github.com/yt-dlp/yt-dlp/issues/6237)) by [Grub4K](https://github.com/Grub4K) +- **dacast**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/c25cac2f8e5fbac2737a426d7778fd2f0efc5381) ([#6896](https://github.com/yt-dlp/yt-dlp/issues/6896)) by [bashonly](https://github.com/bashonly) +- **daftsex**: [Update domain and embed player url](https://github.com/yt-dlp/yt-dlp/commit/fc5a7f9b27d2a89b1f3ca7d33a95301c21d832cd) ([#5966](https://github.com/yt-dlp/yt-dlp/issues/5966)) by [JChris246](https://github.com/JChris246) +- **DigitalConcertHall**: [Support films](https://github.com/yt-dlp/yt-dlp/commit/55ed4ff73487feb3177b037dfc2ea527e777da3e) ([#7202](https://github.com/yt-dlp/yt-dlp/issues/7202)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **discogs**: [Add 
extractor](https://github.com/yt-dlp/yt-dlp/commit/6daaf21092888beff11b807cd46f832f1f9c46a0) ([#6624](https://github.com/yt-dlp/yt-dlp/issues/6624)) by [rjy](https://github.com/rjy) +- **dlf**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b423b6a48e0b19260bc95ab7d72d2138d7f124dc) ([#6697](https://github.com/yt-dlp/yt-dlp/issues/6697)) by [nick-cd](https://github.com/nick-cd) +- **drtv**: [Fix radio page extraction](https://github.com/yt-dlp/yt-dlp/commit/9a06b7b1891b48cebbe275652ae8025a36d97d97) ([#6552](https://github.com/yt-dlp/yt-dlp/issues/6552)) by [viktor-enzell](https://github.com/viktor-enzell) +- **Dumpert**: [Fix m3u8 and support new URL pattern](https://github.com/yt-dlp/yt-dlp/commit/f8ae441501596733e2b967430471643a1d7cacb8) ([#6091](https://github.com/yt-dlp/yt-dlp/issues/6091)) by [DataGhost](https://github.com/DataGhost), [pukkandan](https://github.com/pukkandan) +- **elevensports**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ecfe47973f6603b5367fe2cc3c65274627d94516) ([#7172](https://github.com/yt-dlp/yt-dlp/issues/7172)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **ettutv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/83465fc4100a2fb2c188898fbc2f3021f6a9b4dd) ([#6579](https://github.com/yt-dlp/yt-dlp/issues/6579)) by [elyse0](https://github.com/elyse0) +- **europarl**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/03789976d301eaed3e957dbc041573098f6af059) ([#7114](https://github.com/yt-dlp/yt-dlp/issues/7114)) by [HobbyistDev](https://github.com/HobbyistDev) +- **eurosport**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/45e87ea106ad37b2a002663fa30ee41ce97b16cd) ([#7076](https://github.com/yt-dlp/yt-dlp/issues/7076)) by [HobbyistDev](https://github.com/HobbyistDev) +- **facebook**: [Fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/3b52a606881e6adadc33444abdeacce562b79330) ([#6856](https://github.com/yt-dlp/yt-dlp/issues/6856)) by [ringus1](https://github.com/ringus1) +- **foxnews**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/97d60ad8cd6c99f01e463a9acfce8693aff2a609) ([#7222](https://github.com/yt-dlp/yt-dlp/issues/7222)) by [bashonly](https://github.com/bashonly) +- **funker530**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/cab94a0cd8b6d3fffed5a6faff030274adbed182) ([#7291](https://github.com/yt-dlp/yt-dlp/issues/7291)) by [Cyberes](https://github.com/Cyberes) +- **generic** + - [Accept values for `fragment_query`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/5cc0a8fd2e9fec50026fb92170b57993af939e4a) ([#6600](https://github.com/yt-dlp/yt-dlp/issues/6600)) by [bashonly](https://github.com/bashonly) (With fixes in [9bfe0d1](https://github.com/yt-dlp/yt-dlp/commit/9bfe0d15bd7dbdc6b0e6378fa9f5e2e289b2373b)) + - [Add extractor-args `hls_key`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/c2e0fc40a73dd85ab3920f977f579d475e66ef59) ([#6567](https://github.com/yt-dlp/yt-dlp/issues/6567)) by [bashonly](https://github.com/bashonly) + - [Attempt to detect live HLS](https://github.com/yt-dlp/yt-dlp/commit/93e7c6995e07dafb9dcc06c0d06acf6c5bdfecc5) ([#6775](https://github.com/yt-dlp/yt-dlp/issues/6775)) by [bashonly](https://github.com/bashonly) +- **genius**: [Add support for articles](https://github.com/yt-dlp/yt-dlp/commit/460da07439718d9af1e3661da2a23e05a913a2e6) ([#6474](https://github.com/yt-dlp/yt-dlp/issues/6474)) by [bashonly](https://github.com/bashonly) +- **globalplayer**: [Add 
extractors](https://github.com/yt-dlp/yt-dlp/commit/30647668a92a0ca5cd108776804baac0996bd9f7) ([#6903](https://github.com/yt-dlp/yt-dlp/issues/6903)) by [garret1317](https://github.com/garret1317) +- **gmanetwork**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2d97d154fe4fb84fe2ed3a4e1ed5819e89b71e88) ([#5945](https://github.com/yt-dlp/yt-dlp/issues/5945)) by [HobbyistDev](https://github.com/HobbyistDev) +- **gronkh**: [Extract duration and chapters](https://github.com/yt-dlp/yt-dlp/commit/9c92b803fa24e48543ce969468d5404376e315b7) ([#6817](https://github.com/yt-dlp/yt-dlp/issues/6817)) by [satan1st](https://github.com/satan1st) +- **hentaistigma**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/04f8018a0544736a18494bc3899d06b05b78fae6) by [pukkandan](https://github.com/pukkandan) +- **hidive**: [Fix login](https://github.com/yt-dlp/yt-dlp/commit/e6ab678e36c40ded0aae305bbb866cdab554d417) by [pukkandan](https://github.com/pukkandan) +- **hollywoodreporter**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/6bdb64e2a2a6d504d8ce1dc830fbfb8a7f199c63) ([#6614](https://github.com/yt-dlp/yt-dlp/issues/6614)) by [bashonly](https://github.com/bashonly) +- **hotstar**: [Support `/shows/` URLs](https://github.com/yt-dlp/yt-dlp/commit/7f8ddebbb51c9fd4a347306332a718ba41b371b8) ([#7225](https://github.com/yt-dlp/yt-dlp/issues/7225)) by [bashonly](https://github.com/bashonly) +- **hrefli**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7e35526d5b970a034b9d76215ee3e4bd7631edcd) ([#6762](https://github.com/yt-dlp/yt-dlp/issues/6762)) by [selfisekai](https://github.com/selfisekai) +- **idolplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5c14b213679ed4401288bdc86ae696932e219222) ([#6732](https://github.com/yt-dlp/yt-dlp/issues/6732)) by [ping](https://github.com/ping) +- **iq**: [Set more language codes](https://github.com/yt-dlp/yt-dlp/commit/2d5cae9636714ff922d28c548c349d5f2b48f317) ([#6476](https://github.com/yt-dlp/yt-dlp/issues/6476)) by [D0LLYNH0](https://github.com/D0LLYNH0) +- **iwara** + - [Accept old URLs](https://github.com/yt-dlp/yt-dlp/commit/ab92d8651c48d247dfb7d3f0a824cc986e47c7ed) by [Lesmiscore](https://github.com/Lesmiscore) + - [Fix authentication](https://github.com/yt-dlp/yt-dlp/commit/0a5d7c39e17bb9bd50c9db42bcad40eb82d7f784) ([#7137](https://github.com/yt-dlp/yt-dlp/issues/7137)) by [toomyzoom](https://github.com/toomyzoom) + - [Fix format sorting](https://github.com/yt-dlp/yt-dlp/commit/56793f74c36899742d7abd52afb0deca97d469e1) ([#6651](https://github.com/yt-dlp/yt-dlp/issues/6651)) by [hasezoey](https://github.com/hasezoey) + - [Fix typo](https://github.com/yt-dlp/yt-dlp/commit/d1483ec693c79f0b4ddf493870bcb840aca4da08) by [Lesmiscore](https://github.com/Lesmiscore) + - [Implement login](https://github.com/yt-dlp/yt-dlp/commit/21b9413cf7dd4830b2ece57af21589dd4538fc52) ([#6721](https://github.com/yt-dlp/yt-dlp/issues/6721)) by [toomyzoom](https://github.com/toomyzoom) + - [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/c14af7a741931b364bab3d9546c0f4359f318f8c) ([#6557](https://github.com/yt-dlp/yt-dlp/issues/6557)) by [Lesmiscore](https://github.com/Lesmiscore) + - [Report private videos](https://github.com/yt-dlp/yt-dlp/commit/95a383be1b6fb00c92ee3fb091732c4f6009acb6) ([#6641](https://github.com/yt-dlp/yt-dlp/issues/6641)) by [Lesmiscore](https://github.com/Lesmiscore) +- **JStream**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3459d3c5af3b2572ed51e8ecfda6c11022a838c6) 
([#6252](https://github.com/yt-dlp/yt-dlp/issues/6252)) by [Lesmiscore](https://github.com/Lesmiscore) +- **jwplatform**: [Update `_extract_embed_urls`](https://github.com/yt-dlp/yt-dlp/commit/cf9fd52fabe71d6e7c30d3ea525029ffa561fc9c) ([#6383](https://github.com/yt-dlp/yt-dlp/issues/6383)) by [carusocr](https://github.com/carusocr) +- **kick**: [Make initial request non-fatal](https://github.com/yt-dlp/yt-dlp/commit/0a6918a4a1431960181d8c50e0bbbcb0afbaff9a) by [bashonly](https://github.com/bashonly) +- **LastFM**: [Rewrite playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/026435714cb7c39613a0d7d2acd15d3823b78d94) ([#6379](https://github.com/yt-dlp/yt-dlp/issues/6379)) by [hatienl0i261299](https://github.com/hatienl0i261299), [pukkandan](https://github.com/pukkandan) +- **lbry**: [Extract original quality formats](https://github.com/yt-dlp/yt-dlp/commit/44c0d66442b568d9e1359e669d8b029b08a77fa7) ([#7257](https://github.com/yt-dlp/yt-dlp/issues/7257)) by [bashonly](https://github.com/bashonly) +- **line**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/faa0332ed69e070cf3bd31390589a596e962f392) ([#6734](https://github.com/yt-dlp/yt-dlp/issues/6734)) by [sian1468](https://github.com/sian1468) +- **livestream**: [Support videos with account id](https://github.com/yt-dlp/yt-dlp/commit/bfdf144c7e5d7a93fbfa9d8e65598c72bf2b542a) ([#6324](https://github.com/yt-dlp/yt-dlp/issues/6324)) by [theperfectpunk](https://github.com/theperfectpunk) +- **medaltv**: [Fix clips](https://github.com/yt-dlp/yt-dlp/commit/1e3c2b6ec28d7ab5e31341fa93c47b65be4fbff4) ([#6502](https://github.com/yt-dlp/yt-dlp/issues/6502)) by [xenova](https://github.com/xenova) +- **mediastream**: [Improve `WinSports` and embed extraction](https://github.com/yt-dlp/yt-dlp/commit/03025b6e105139d01cd415ddc51fd692957fd2ba) ([#6426](https://github.com/yt-dlp/yt-dlp/issues/6426)) by [bashonly](https://github.com/bashonly) +- **mgtv**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/59d9fe08312bbb76ee26238d207a8ca35410a48d) ([#7234](https://github.com/yt-dlp/yt-dlp/issues/7234)) by [bashonly](https://github.com/bashonly) +- **Mzaalo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/dc3c44f349ba85af320e706e2a27ad81a78b1c6e) ([#7163](https://github.com/yt-dlp/yt-dlp/issues/7163)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **nbc**: [Fix `NBCStations` direct mp4 formats](https://github.com/yt-dlp/yt-dlp/commit/9be0fe1fd967f62cbf3c60bd14e1021a70abc147) ([#6637](https://github.com/yt-dlp/yt-dlp/issues/6637)) by [bashonly](https://github.com/bashonly) +- **nebula**: [Add `beta.nebula.tv`](https://github.com/yt-dlp/yt-dlp/commit/cbfe2e5cbe0f4649a91e323a82b8f5f774f36662) ([#6516](https://github.com/yt-dlp/yt-dlp/issues/6516)) by [unbeatable-101](https://github.com/unbeatable-101) +- **nekohacker**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/489f51279d00318018478fd7461eddbe3b45297e) ([#7003](https://github.com/yt-dlp/yt-dlp/issues/7003)) by [hasezoey](https://github.com/hasezoey) +- **nhk** + - [Add `NhkRadiru` extractor](https://github.com/yt-dlp/yt-dlp/commit/8f0be90ecb3b8d862397177bb226f17b245ef933) ([#6819](https://github.com/yt-dlp/yt-dlp/issues/6819)) by [garret1317](https://github.com/garret1317) + - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/f41b949a2ef646fbc36375febbe3f0c19d742c0f) ([#7180](https://github.com/yt-dlp/yt-dlp/issues/7180)) by [menschel](https://github.com/menschel), [sjthespian](https://github.com/sjthespian) + - `NhkRadiruLive`: [Add 
extractor](https://github.com/yt-dlp/yt-dlp/commit/81c8b9bdd9841b72cbfc1bbff9dab5fb4aa038b0) ([#7332](https://github.com/yt-dlp/yt-dlp/issues/7332)) by [garret1317](https://github.com/garret1317) +- **niconico** + - [Download comments from the new endpoint](https://github.com/yt-dlp/yt-dlp/commit/52ecc33e221f7de7eb6fed6c22489f0c5fdd2c6d) ([#6773](https://github.com/yt-dlp/yt-dlp/issues/6773)) by [Lesmiscore](https://github.com/Lesmiscore) + - live: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f8f9250fe280d37f0988646cd5cc0072f4d33a6d) ([#5764](https://github.com/yt-dlp/yt-dlp/issues/5764)) by [Lesmiscore](https://github.com/Lesmiscore) + - series: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/c86e433c35fe5da6cb29f3539eef97497f84ed38) ([#6898](https://github.com/yt-dlp/yt-dlp/issues/6898)) by [sqrtNOT](https://github.com/sqrtNOT) +- **nubilesporn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/d4e6ef40772e0560a8ed33b844ef7549e86837be) ([#6231](https://github.com/yt-dlp/yt-dlp/issues/6231)) by [permunkle](https://github.com/permunkle) +- **odnoklassniki**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/1a2eb5bda51d8b7a78a65acebf72a0dcf9da196b) ([#7217](https://github.com/yt-dlp/yt-dlp/issues/7217)) by [bashonly](https://github.com/bashonly) +- **opencast** + - [Add ltitools to `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3588be59cee429a0ab5c4ceb2f162298bb44147d) ([#6371](https://github.com/yt-dlp/yt-dlp/issues/6371)) by [C0D3D3V](https://github.com/C0D3D3V) + - [Fix format bug](https://github.com/yt-dlp/yt-dlp/commit/89dbf0848370deaa55af88c3593a2a264124caf5) ([#6512](https://github.com/yt-dlp/yt-dlp/issues/6512)) by [C0D3D3V](https://github.com/C0D3D3V) +- **owncloud**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c6d4b82a8b8bce59b1c9ce5e6d349ea428dac0a7) ([#6533](https://github.com/yt-dlp/yt-dlp/issues/6533)) by [C0D3D3V](https://github.com/C0D3D3V) +- **Parler**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/80ea6d3dea8483cddd39fc89b5ee1fc06670c33c) ([#6446](https://github.com/yt-dlp/yt-dlp/issues/6446)) by [JChris246](https://github.com/JChris246) +- **pgatour**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3ae182ad89e1427ff7b1684d6a44ff93fa857a0c) ([#6613](https://github.com/yt-dlp/yt-dlp/issues/6613)) by [bashonly](https://github.com/bashonly) +- **playsuisse**: [Support new url format](https://github.com/yt-dlp/yt-dlp/commit/94627c5dde12a72766bdba36e056916c29c40ed1) ([#6528](https://github.com/yt-dlp/yt-dlp/issues/6528)) by [sbor23](https://github.com/sbor23) +- **polskieradio**: [Improve extractors](https://github.com/yt-dlp/yt-dlp/commit/738c90a463257634455ada3e5c18b714c531dede) ([#5948](https://github.com/yt-dlp/yt-dlp/issues/5948)) by [selfisekai](https://github.com/selfisekai) +- **pornez**: [Support new URL formats](https://github.com/yt-dlp/yt-dlp/commit/cbdf9408e6f1e35e98fd6477b3d6902df5b8a47f) ([#6792](https://github.com/yt-dlp/yt-dlp/issues/6792)) by [zhgwn](https://github.com/zhgwn) +- **pornhub**: [Set access cookies to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/62beefa818c75c20b6941389bb197051554a5d41) ([#6685](https://github.com/yt-dlp/yt-dlp/issues/6685)) by [arobase-che](https://github.com/arobase-che), [Schmoaaaaah](https://github.com/Schmoaaaaah) +- **rai**: [Rewrite extractors](https://github.com/yt-dlp/yt-dlp/commit/c6d3f81a4077aaf9cffc6aa2d0dec92f38e74bb0) ([#5940](https://github.com/yt-dlp/yt-dlp/issues/5940)) by [danog](https://github.com/danog), 
[nixxo](https://github.com/nixxo) +- **recurbate**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2502cfed91415c7ccfff925fd3404d230046484) ([#6297](https://github.com/yt-dlp/yt-dlp/issues/6297)) by [mrscrapy](https://github.com/mrscrapy) +- **reddit** + - [Add login support](https://github.com/yt-dlp/yt-dlp/commit/4d9280c9c853733534dda60486fa949bcca36c9e) ([#6950](https://github.com/yt-dlp/yt-dlp/issues/6950)) by [bashonly](https://github.com/bashonly) + - [Support cookies and short URLs](https://github.com/yt-dlp/yt-dlp/commit/7a6f6f24592a8065376f11a58e44878807732cf6) ([#6825](https://github.com/yt-dlp/yt-dlp/issues/6825)) by [bashonly](https://github.com/bashonly) +- **rokfin**: [Re-construct manifest url](https://github.com/yt-dlp/yt-dlp/commit/7a6c8a0807941dd24fbf0d6172e811884f98e027) ([#6507](https://github.com/yt-dlp/yt-dlp/issues/6507)) by [vampirefrog](https://github.com/vampirefrog) +- **rottentomatoes**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2d306c03d6f2697fcbabb7da35aa62cc078359d3) ([#6844](https://github.com/yt-dlp/yt-dlp/issues/6844)) by [JChris246](https://github.com/JChris246) +- **rozhlas** + - [Extract manifest formats](https://github.com/yt-dlp/yt-dlp/commit/e4cf7741f9302b3faa092962f2895b55cb3d89bb) ([#6590](https://github.com/yt-dlp/yt-dlp/issues/6590)) by [bashonly](https://github.com/bashonly) + - `MujRozhlas`: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2b801fea59628d5c873e06a0727fbf2051bbd1f) ([#7129](https://github.com/yt-dlp/yt-dlp/issues/7129)) by [stanoarn](https://github.com/stanoarn) +- **rtvc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/9b30cd3dfce83c2f0201b28a7a3ef44ab9722664) ([#6578](https://github.com/yt-dlp/yt-dlp/issues/6578)) by [elyse0](https://github.com/elyse0) +- **rumble** + - [Detect timeline format](https://github.com/yt-dlp/yt-dlp/commit/78bc1868ff3352108ab2911033d1ac67a55f151e) by [pukkandan](https://github.com/pukkandan) + - [Fix videos without quality selection](https://github.com/yt-dlp/yt-dlp/commit/6994afc030d2a786d8032075ed71a14d7eac5a4f) by [pukkandan](https://github.com/pukkandan) +- **sbs**: [Overhaul extractor for new API](https://github.com/yt-dlp/yt-dlp/commit/6a765f135ccb654861336ea27a2c1c24ea8e286f) ([#6839](https://github.com/yt-dlp/yt-dlp/issues/6839)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf), [vidiot720](https://github.com/vidiot720) +- **shemaroome**: [Pass `stream_key` header to downloader](https://github.com/yt-dlp/yt-dlp/commit/7bc92517463f5766e9d9b92c3823b5cf403c0e3d) ([#7224](https://github.com/yt-dlp/yt-dlp/issues/7224)) by [bashonly](https://github.com/bashonly) +- **sonyliv**: [Fix login with token](https://github.com/yt-dlp/yt-dlp/commit/4815d35c191e7d375b94492a6486dd2ba43a8954) ([#7223](https://github.com/yt-dlp/yt-dlp/issues/7223)) by [bashonly](https://github.com/bashonly) +- **stageplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e5265dc6517478e589ee3c1ff0cb19bdf4e35ce1) ([#6838](https://github.com/yt-dlp/yt-dlp/issues/6838)) by [bashonly](https://github.com/bashonly) +- **stripchat**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f9213f8a2d7ba46b912afe1dd3ce6bb700a33d72) ([#7306](https://github.com/yt-dlp/yt-dlp/issues/7306)) by [foreignBlade](https://github.com/foreignBlade) +- **substack**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/12037d8b0a578fcc78a5c8f98964e48ee6060e25) ([#7218](https://github.com/yt-dlp/yt-dlp/issues/7218)) by [bashonly](https://github.com/bashonly) +- 
**sverigesradio**: [Support slug URLs](https://github.com/yt-dlp/yt-dlp/commit/5ee9a7d6e18ceea956e831994cf11c423979354f) ([#7220](https://github.com/yt-dlp/yt-dlp/issues/7220)) by [bashonly](https://github.com/bashonly) +- **tagesschau**: [Fix single audio urls](https://github.com/yt-dlp/yt-dlp/commit/af7585c824a1e405bd8afa46d87b4be322edc93c) ([#6626](https://github.com/yt-dlp/yt-dlp/issues/6626)) by [flashdagger](https://github.com/flashdagger) +- **teamcoco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c459d45dd4d417fb80a52e1a04e607776a44baa4) ([#6437](https://github.com/yt-dlp/yt-dlp/issues/6437)) by [bashonly](https://github.com/bashonly) +- **telecaribe**: [Expand livestream support](https://github.com/yt-dlp/yt-dlp/commit/69b2f838d3d3e37dc17367ef64d978db1bea45cf) ([#6601](https://github.com/yt-dlp/yt-dlp/issues/6601)) by [bashonly](https://github.com/bashonly) +- **tencent**: [Fix fatal metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/971d901d129403e875a04dd92109507a03fbc070) ([#7219](https://github.com/yt-dlp/yt-dlp/issues/7219)) by [bashonly](https://github.com/bashonly) +- **thesun**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/0181b9a1b31db3fde943f7cd3fe9662f23bff292) ([#6522](https://github.com/yt-dlp/yt-dlp/issues/6522)) by [hatienl0i261299](https://github.com/hatienl0i261299) +- **tiktok** + - [Extract 1080p adaptive formats](https://github.com/yt-dlp/yt-dlp/commit/c2a1bdb00931969193f2a31ea27b9c66a07aaec2) ([#7228](https://github.com/yt-dlp/yt-dlp/issues/7228)) by [bashonly](https://github.com/bashonly) + - [Fix and improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/925936908a3c3ee0e508621db14696b9f6a8b563) ([#6777](https://github.com/yt-dlp/yt-dlp/issues/6777)) by [bashonly](https://github.com/bashonly) + - [Fix mp3 formats](https://github.com/yt-dlp/yt-dlp/commit/8ceb07e870424c219dced8f4348729553f05c5cc) ([#6615](https://github.com/yt-dlp/yt-dlp/issues/6615)) by [bashonly](https://github.com/bashonly) + - [Fix resolution extraction](https://github.com/yt-dlp/yt-dlp/commit/ab6057ec80aa75db6303b8206916d00c376c622c) ([#7237](https://github.com/yt-dlp/yt-dlp/issues/7237)) by [puc9](https://github.com/puc9) + - [Improve `TikTokLive` extractor](https://github.com/yt-dlp/yt-dlp/commit/216bcb66d7dce0762767d751dad10650cb57da9d) ([#6520](https://github.com/yt-dlp/yt-dlp/issues/6520)) by [bashonly](https://github.com/bashonly) +- **triller**: [Support short URLs, detect removed videos](https://github.com/yt-dlp/yt-dlp/commit/33b737bedf8383c0d00d4e1d06a5273dcdfdb756) ([#6636](https://github.com/yt-dlp/yt-dlp/issues/6636)) by [bashonly](https://github.com/bashonly) +- **tv4**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/125ffaa1737dd04716f2f6fbb0595ad3eb7a4b1c) ([#5649](https://github.com/yt-dlp/yt-dlp/issues/5649)) by [dirkf](https://github.com/dirkf), [TxI5](https://github.com/TxI5) +- **tvp**: [Use new API](https://github.com/yt-dlp/yt-dlp/commit/0c7ce146e4d2a84e656d78f6857952bfd25ab389) ([#6989](https://github.com/yt-dlp/yt-dlp/issues/6989)) by [selfisekai](https://github.com/selfisekai) +- **tvplay**: [Remove outdated domains](https://github.com/yt-dlp/yt-dlp/commit/937264419f9bf375d5656785ae6e53282587c15d) ([#7106](https://github.com/yt-dlp/yt-dlp/issues/7106)) by [ivanskodje](https://github.com/ivanskodje) +- **twitch** + - [Extract original size thumbnail](https://github.com/yt-dlp/yt-dlp/commit/80b732b7a9585b2a61e456dc0d2d014a439cbaee) ([#6629](https://github.com/yt-dlp/yt-dlp/issues/6629)) by 
[JC-Chung](https://github.com/JC-Chung) + - [Fix `is_live`](https://github.com/yt-dlp/yt-dlp/commit/0551511b45f7847f40e4314aa9e624e80d086539) ([#6500](https://github.com/yt-dlp/yt-dlp/issues/6500)) by [elyse0](https://github.com/elyse0) + - [Support mobile clips](https://github.com/yt-dlp/yt-dlp/commit/02312c03cf53eb1da24c9ad022ee79af26060733) ([#6699](https://github.com/yt-dlp/yt-dlp/issues/6699)) by [bepvte](https://github.com/bepvte) + - [Update `_CLIENT_ID` and add extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/01231feb142e80828985aabdec04ac608e3d43e2) ([#7200](https://github.com/yt-dlp/yt-dlp/issues/7200)) by [bashonly](https://github.com/bashonly) + - vod: [Support links from schedule tab](https://github.com/yt-dlp/yt-dlp/commit/dbce5afa6bb61f6272ade613f2e9a3d66b88c7ea) ([#7071](https://github.com/yt-dlp/yt-dlp/issues/7071)) by [falbrechtskirchinger](https://github.com/falbrechtskirchinger) +- **twitter** + - [Add login support](https://github.com/yt-dlp/yt-dlp/commit/d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c) ([#7258](https://github.com/yt-dlp/yt-dlp/issues/7258)) by [bashonly](https://github.com/bashonly) + - [Default to GraphQL, handle auth errors](https://github.com/yt-dlp/yt-dlp/commit/147e62fc584c3ea6fdb09bb7a47905df68553a22) ([#6957](https://github.com/yt-dlp/yt-dlp/issues/6957)) by [bashonly](https://github.com/bashonly) + - spaces: [Add `release_timestamp`](https://github.com/yt-dlp/yt-dlp/commit/1c16d9df5330819cc79ad588b24aa5b72765c168) ([#7186](https://github.com/yt-dlp/yt-dlp/issues/7186)) by [CeruleanSky](https://github.com/CeruleanSky) +- **urplay**: [Extract all subtitles](https://github.com/yt-dlp/yt-dlp/commit/7bcd4813215ac98daa4949af2ffc677c78307a38) ([#7309](https://github.com/yt-dlp/yt-dlp/issues/7309)) by [hoaluvn](https://github.com/hoaluvn) +- **voot**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4f7b11cc1c1cebf598107e00cd7295588ed484da) ([#7227](https://github.com/yt-dlp/yt-dlp/issues/7227)) by [bashonly](https://github.com/bashonly) +- **vrt**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/1a7dcca378e80a387923ee05c250d8ba122441c6) ([#6244](https://github.com/yt-dlp/yt-dlp/issues/6244)) by [bashonly](https://github.com/bashonly), [bergoid](https://github.com/bergoid), [jeroenj](https://github.com/jeroenj) +- **weverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b844a3f8b16500663e7ab6c6ec061cc9b30f71ac) ([#6711](https://github.com/yt-dlp/yt-dlp/issues/6711)) by [bashonly](https://github.com/bashonly) (With fixes in [fd5d93f](https://github.com/yt-dlp/yt-dlp/commit/fd5d93f7040f9776fd541f4e4079dad7d3b3fb4f)) +- **wevidi**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1ea15603d852971ed7d92f4de12808b27b3d9370) ([#6868](https://github.com/yt-dlp/yt-dlp/issues/6868)) by [truedread](https://github.com/truedread) +- **weyyak**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6dc00acf0f1f1107a626c21befd1691403e6aeeb) ([#7124](https://github.com/yt-dlp/yt-dlp/issues/7124)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **whyp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2c566ed14101673c651c08c306c30fa5b4010b85) ([#6803](https://github.com/yt-dlp/yt-dlp/issues/6803)) by [CoryTibbettsDev](https://github.com/CoryTibbettsDev) +- **wrestleuniverse** + - [Fix cookies support](https://github.com/yt-dlp/yt-dlp/commit/c8561c6d03f025268d6d3972abeb47987c8d7cbb) by [bashonly](https://github.com/bashonly) + - [Fix extraction, add 
login](https://github.com/yt-dlp/yt-dlp/commit/ef8fb7f029b816dfc95600727d84400591a3b5c5) ([#6982](https://github.com/yt-dlp/yt-dlp/issues/6982)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **wykop**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/aed945e1b9b7d3af2a907e1a12e6508cc81d6a20) ([#6140](https://github.com/yt-dlp/yt-dlp/issues/6140)) by [selfisekai](https://github.com/selfisekai) +- **ximalaya**: [Sort playlist entries](https://github.com/yt-dlp/yt-dlp/commit/8790ea7b2536332777bce68590386b1aa935fac7) ([#7292](https://github.com/yt-dlp/yt-dlp/issues/7292)) by [linsui](https://github.com/linsui) +- **YahooGyaOIE, YahooGyaOPlayerIE**: [Delete extractors due to website close](https://github.com/yt-dlp/yt-dlp/commit/68be95bd0ca3f76aa63c9812935bd826b3a42e53) ([#6218](https://github.com/yt-dlp/yt-dlp/issues/6218)) by [Lesmiscore](https://github.com/Lesmiscore) +- **yappy**: YappyProfile: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6f69101dc912690338d32e2aab085c32e44eba3f) ([#7346](https://github.com/yt-dlp/yt-dlp/issues/7346)) by [7vlad7](https://github.com/7vlad7) +- **youku**: [Improve error message](https://github.com/yt-dlp/yt-dlp/commit/ef0848abd425dfda6db62baa8d72897eefb0007f) ([#6690](https://github.com/yt-dlp/yt-dlp/issues/6690)) by [carusocr](https://github.com/carusocr) +- **youporn**: [Extract m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/ddae33754ae1f32dd9c64cf895c47d20f6b5f336) by [pukkandan](https://github.com/pukkandan) +- **youtube** + - [Add client name to `format_note` when `-v`](https://github.com/yt-dlp/yt-dlp/commit/c795c39f27244cbce846067891827e4847036441) ([#6254](https://github.com/yt-dlp/yt-dlp/issues/6254)) by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan) + - [Add extractor-arg `include_duplicate_formats`](https://github.com/yt-dlp/yt-dlp/commit/86cb922118b236306310a72657f70426c20e28bb) by [pukkandan](https://github.com/pukkandan) + - [Bypass throttling for `-f17`](https://github.com/yt-dlp/yt-dlp/commit/c9abebb851e6188cb34b9eb744c1863dd46af919) by [pukkandan](https://github.com/pukkandan) + - [Construct fragment list lazily](https://github.com/yt-dlp/yt-dlp/commit/2a23d92d9ec44a0168079e38bcf3d383e5c4c7bb) by [pukkandan](https://github.com/pukkandan) (With fixes in [e389d17](https://github.com/yt-dlp/yt-dlp/commit/e389d172b6f42e4f332ae679dc48543fb7b9b61d)) + - [Define strict uploader metadata mapping](https://github.com/yt-dlp/yt-dlp/commit/7666b93604b97e9ada981c6b04ccf5605dd1bd44) ([#6384](https://github.com/yt-dlp/yt-dlp/issues/6384)) by [coletdjnz](https://github.com/coletdjnz) + - [Determine audio language using automatic captions](https://github.com/yt-dlp/yt-dlp/commit/ff9b0e071ffae5543cc309e6f9e647ac51e5846e) by [pukkandan](https://github.com/pukkandan) + - [Extract `channel_is_verified`](https://github.com/yt-dlp/yt-dlp/commit/8213ce28a485e200f6a7e1af1434a987c8e702bd) ([#7213](https://github.com/yt-dlp/yt-dlp/issues/7213)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract `heatmap` data](https://github.com/yt-dlp/yt-dlp/commit/5caf30dbc34f10b0be60676fece635b5c59f0d72) ([#7100](https://github.com/yt-dlp/yt-dlp/issues/7100)) by [tntmod54321](https://github.com/tntmod54321) + - [Extract more metadata for comments](https://github.com/yt-dlp/yt-dlp/commit/c35448b7b14113b35c4415dbfbf488c4731f006f) ([#7179](https://github.com/yt-dlp/yt-dlp/issues/7179)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract uploader metadata for feed/playlist 
items](https://github.com/yt-dlp/yt-dlp/commit/93e12ed76ef49252dc6869b59d21d0777e5e11af) by [coletdjnz](https://github.com/coletdjnz) + - [Fix comment loop detection for pinned comments](https://github.com/yt-dlp/yt-dlp/commit/141a8dff98874a426d7fbe772e0a8421bb42656f) ([#6714](https://github.com/yt-dlp/yt-dlp/issues/6714)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix continuation loop with no comments](https://github.com/yt-dlp/yt-dlp/commit/18f8fba7c89a87f99cc3313a1795848867e84fff) ([#7148](https://github.com/yt-dlp/yt-dlp/issues/7148)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix parsing `comment_count`](https://github.com/yt-dlp/yt-dlp/commit/071670cbeaa01ddf2cc20a95ae6da25f8f086431) ([#6523](https://github.com/yt-dlp/yt-dlp/issues/6523)) by [nick-cd](https://github.com/nick-cd) + - [Handle incomplete initial data from watch page](https://github.com/yt-dlp/yt-dlp/commit/607510b9f2f67bfe7d33d74031a5c1fe22a24862) ([#6510](https://github.com/yt-dlp/yt-dlp/issues/6510)) by [coletdjnz](https://github.com/coletdjnz) + - [Ignore wrong fps of some formats](https://github.com/yt-dlp/yt-dlp/commit/97afb093d4cbe5df889145afa5f9ede4535e93e4) by [pukkandan](https://github.com/pukkandan) + - [Misc cleanup](https://github.com/yt-dlp/yt-dlp/commit/14a14335b280766fbf5a469ae26836d6c1fe450a) by [coletdjnz](https://github.com/coletdjnz) + - [Prioritize premium formats](https://github.com/yt-dlp/yt-dlp/commit/51a07b0dca4c079d58311c19b6d1c097c24bb021) by [pukkandan](https://github.com/pukkandan) + - [Revert default formats to `https`](https://github.com/yt-dlp/yt-dlp/commit/c6786ff3baaf72a5baa4d56d34058e54cbcf8ceb) by [pukkandan](https://github.com/pukkandan) + - [Support podcasts and releases tabs](https://github.com/yt-dlp/yt-dlp/commit/447afb9eaa65bc677e3245c83e53a8e69c174a3c) by [coletdjnz](https://github.com/coletdjnz) + - [Support shorter relative time format](https://github.com/yt-dlp/yt-dlp/commit/2fb35f6004c7625f0dd493da4a5abf0690f7777c) ([#7191](https://github.com/yt-dlp/yt-dlp/issues/7191)) by [coletdjnz](https://github.com/coletdjnz) + - music_search_url: [Extract title](https://github.com/yt-dlp/yt-dlp/commit/69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2) ([#7102](https://github.com/yt-dlp/yt-dlp/issues/7102)) by [kangalio](https://github.com/kangalio) +- **zaiko** + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/345b4c0aedd9d19898ce00d5cef35fe0d277a052) ([#7254](https://github.com/yt-dlp/yt-dlp/issues/7254)) by [c-basalt](https://github.com/c-basalt) + - ZaikoETicket: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5cc09c004bd5edbbada9b041c08a720cadc4f4df) ([#7347](https://github.com/yt-dlp/yt-dlp/issues/7347)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **zdf**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/ee0ed0338df328cd986f97315c8162b5a151476d) by [bashonly](https://github.com/bashonly) +- **zee5**: [Fix extraction of new content](https://github.com/yt-dlp/yt-dlp/commit/9d7fde89a40360396f0baa2ee8bf507f92108b32) ([#7280](https://github.com/yt-dlp/yt-dlp/issues/7280)) by [bashonly](https://github.com/bashonly) +- **zingmp3**: [Fix and improve extractors](https://github.com/yt-dlp/yt-dlp/commit/17d7ca84ea723c20668bd9bfa938be7ea0e64f6b) ([#6367](https://github.com/yt-dlp/yt-dlp/issues/6367)) by [hatienl0i261299](https://github.com/hatienl0i261299) +- **zoom** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/79c77e85b70ae3b9942d5a88c14d021a9bd24222) ([#6741](https://github.com/yt-dlp/yt-dlp/issues/6741)) by 
[shreyasminocha](https://github.com/shreyasminocha) + - [Fix share URL extraction](https://github.com/yt-dlp/yt-dlp/commit/90c1f5120694105496a6ad9e3ecfc6c25de6cae1) ([#6789](https://github.com/yt-dlp/yt-dlp/issues/6789)) by [bashonly](https://github.com/bashonly) + +#### Downloader changes +- **curl**: [Fix progress reporting](https://github.com/yt-dlp/yt-dlp/commit/66aeaac9aa30b5959069ba84e53a5508232deb38) by [pukkandan](https://github.com/pukkandan) +- **fragment**: [Do not sleep between fragments](https://github.com/yt-dlp/yt-dlp/commit/424f3bf03305088df6e01d62f7311be8601ad3f4) by [pukkandan](https://github.com/pukkandan) + +#### Postprocessor changes +- [Fix chapters if duration is not extracted](https://github.com/yt-dlp/yt-dlp/commit/01ddec7e661bf90dc4c34e6924eb9d7629886cef) ([#6037](https://github.com/yt-dlp/yt-dlp/issues/6037)) by [bashonly](https://github.com/bashonly) +- [Print newline for `--progress-template`](https://github.com/yt-dlp/yt-dlp/commit/13ff78095372fd98900a32572cf817994c07ccb5) by [pukkandan](https://github.com/pukkandan) +- **EmbedThumbnail, FFmpegMetadata**: [Fix error on attaching thumbnails and info json for mkv/mka](https://github.com/yt-dlp/yt-dlp/commit/0f0875ed555514f32522a0f30554fb08825d5124) ([#6647](https://github.com/yt-dlp/yt-dlp/issues/6647)) by [Lesmiscore](https://github.com/Lesmiscore) +- **FFmpegFixupM3u8PP**: [Check audio codec before fixup](https://github.com/yt-dlp/yt-dlp/commit/3f7e2bd80e3c5d8a1682f20a1b245fcd974f295d) ([#6778](https://github.com/yt-dlp/yt-dlp/issues/6778)) by [bashonly](https://github.com/bashonly) +- **FixupDuplicateMoov**: [Fix bug in triggering](https://github.com/yt-dlp/yt-dlp/commit/26010b5cec50193b98ad7845d1d77450f9f14c2b) by [pukkandan](https://github.com/pukkandan) + +#### Misc. 
changes +- [Add automatic duplicate issue detection](https://github.com/yt-dlp/yt-dlp/commit/15b2d3db1d40b0437fca79d8874d392aa54b3cdd) by [pukkandan](https://github.com/pukkandan) +- **build** + - [Fix macOS target](https://github.com/yt-dlp/yt-dlp/commit/44a79958f0b596ee71e1eb25f158610aada29d1b) by [Grub4K](https://github.com/Grub4K) + - [Implement build verification using `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/b73193c99aa23b135732408a5fcf655c68d731c6) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Pin `pyinstaller` version for MacOS](https://github.com/yt-dlp/yt-dlp/commit/427a8fafbb0e18c28d0ed7960be838d7b26b88d3) by [pukkandan](https://github.com/pukkandan) + - [Various build workflow improvements](https://github.com/yt-dlp/yt-dlp/commit/c4efa0aefec8daef1de62fd1693f13edf3c8b03c) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **cleanup** + - Miscellaneous + - [6f2287c](https://github.com/yt-dlp/yt-dlp/commit/6f2287cb18cbfb27518f068d868fa9390fee78ad) by [pukkandan](https://github.com/pukkandan) + - [ad54c91](https://github.com/yt-dlp/yt-dlp/commit/ad54c9130e793ce433bf9da334fa80df9f3aee58) by [freezboltz](https://github.com/freezboltz), [mikf](https://github.com/mikf), [pukkandan](https://github.com/pukkandan) +- **cleanup, utils**: [Split into submodules](https://github.com/yt-dlp/yt-dlp/commit/69bec6730ec9d724bcedeab199d9d684d61423ba) ([#7090](https://github.com/yt-dlp/yt-dlp/issues/7090)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +- **cli_to_api**: [Add script](https://github.com/yt-dlp/yt-dlp/commit/46f1370e9af6f8af8762f67e27e5acb8f0c48a47) by [pukkandan](https://github.com/pukkandan) +- **devscripts**: `make_changelog`: [Various improvements](https://github.com/yt-dlp/yt-dlp/commit/23c39a4beadee382060bb47fdaa21316ca707d38) by [Grub4K](https://github.com/Grub4K) +- **docs**: [Misc improvements](https://github.com/yt-dlp/yt-dlp/commit/c8bc203fbf3bb09914e53f0833eed622ab7edbb9) by [pukkandan](https://github.com/pukkandan) + ### 2023.03.04 #### Extractor changes diff --git a/README.md b/README.md index d9a5e6cefc..2f1fd9a0d8 100644 --- a/README.md +++ b/README.md @@ -910,7 +910,7 @@ ## Authentication Options: either the path or its containing directory. Defaults to ~/.netrc --netrc-cmd NETRC_CMD Command to execute to get the credentials - credentials for an extractor. + for an extractor. 
--video-password PASSWORD Video password (vimeo, youku) --ap-mso MSO Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for diff --git a/supportedsites.md b/supportedsites.md index f5c8c38295..882b272aab 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -150,7 +150,9 @@ # Supported sites - **bfmtv** - **bfmtv:article** - **bfmtv:live** - - **BibelTV** + - **bibeltv:live**: BibelTV live program + - **bibeltv:series**: BibelTV series playlist + - **bibeltv:video**: BibelTV single video - **Bigflix** - **Bigo** - **Bild**: Bild.de @@ -183,12 +185,17 @@ # Supported sites - **Bloomberg** - **BokeCC** - **BongaCams** - - **BooyahClips** - **BostonGlobe** - **Box** - **BoxCastVideo** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk + - **BrainPOP**: [*brainpop*](## "netrc machine") + - **BrainPOPELL**: [*brainpop*](## "netrc machine") + - **BrainPOPEsp**: [*brainpop*](## "netrc machine") BrainPOP Español + - **BrainPOPFr**: [*brainpop*](## "netrc machine") BrainPOP Français + - **BrainPOPIl**: [*brainpop*](## "netrc machine") BrainPOP Hebrew + - **BrainPOPJr**: [*brainpop*](## "netrc machine") - **BravoTV** - **Break** - **BreitBart** @@ -207,6 +214,8 @@ # Supported sites - **CAM4** - **Camdemy** - **CamdemyFolder** + - **CamFMEpisode** + - **CamFMShow** - **CamModels** - **Camsoda** - **CamtasiaEmbed** @@ -214,8 +223,6 @@ # Supported sites - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr - - **Canvas** - - **CanvasEen**: canvas.be and een.be - **CarambaTV** - **CarambaTVPage** - **CartoonNetwork** @@ -225,8 +232,10 @@ # Supported sites - **CBSInteractive** - **CBSLocal** - **CBSLocalArticle** + - **CBSLocalLive** - **cbsnews**: CBS News - **cbsnews:embed** + - **cbsnews:live**: CBS News Livestream - **cbsnews:livevideo**: CBS News Live Videos - **cbssports** - **cbssports:embed** @@ -252,6 +261,7 @@ # Supported sites - **CiscoLiveSession** - **ciscowebex**: Cisco Webex - **CJSW** + - **Clipchamp** - **cliphunter** - **Clippit** - **ClipRs** @@ -271,6 +281,7 @@ # Supported sites - **CNNIndonesia** - **ComedyCentral** - **ComedyCentralTV** + - **ConanClassic** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** - **CookingChannel** @@ -286,7 +297,10 @@ # Supported sites - **CrooksAndLiars** - **CrowdBunker** - **CrowdBunkerChannel** + - **Crtvg** - **crunchyroll**: [*crunchyroll*](## "netrc machine") + - **crunchyroll:artist**: [*crunchyroll*](## "netrc machine") + - **crunchyroll:music**: [*crunchyroll*](## "netrc machine") - **crunchyroll:playlist**: [*crunchyroll*](## "netrc machine") - **CSpan**: C-SPAN - **CSpanCongress** @@ -301,6 +315,8 @@ # Supported sites - **CWTV** - **Cybrary**: [*cybrary*](## "netrc machine") - **CybraryCourse**: [*cybrary*](## "netrc machine") + - **DacastPlaylist** + - **DacastVOD** - **Daftsex** - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** @@ -331,6 +347,7 @@ # Supported sites - **DigitalConcertHall**: [*digitalconcerthall*](## "netrc machine") DigitalConcertHall extractor - **DigitallySpeaking** - **Digiteka** + - **DiscogsReleasePlaylist** - **Discovery** - **DiscoveryLife** - **DiscoveryNetworksDe** @@ -341,6 +358,8 @@ # Supported sites - **DiscoveryPlusItalyShow** - **Disney** - **DIYNetwork** + - **dlf** + - **dlf:corpus**: DLF 
Multi-feed Archives - **dlive:stream** - **dlive:vod** - **Dotsub** @@ -378,6 +397,7 @@ # Supported sites - **EinsUndEinsTVRecordings**: [*1und1tv*](## "netrc machine") - **Einthusan** - **eitb.tv** + - **ElevenSports** - **EllenTube** - **EllenTubePlaylist** - **EllenTubeVideo** @@ -400,6 +420,7 @@ # Supported sites - **ESPNArticle** - **ESPNCricInfo** - **EsriVideo** + - **EttuTv** - **Europa** - **EuroParlWebstream** - **EuropeanTour** @@ -460,6 +481,7 @@ # Supported sites - **funimation:page**: [*funimation*](## "netrc machine") - **funimation:show**: [*funimation*](## "netrc machine") - **Funk** + - **Funker530** - **Fusion** - **Fux** - **FuyinTV** @@ -493,10 +515,16 @@ # Supported sites - **GlattvisionTVLive**: [*glattvisiontv*](## "netrc machine") - **GlattvisionTVRecordings**: [*glattvisiontv*](## "netrc machine") - **Glide**: Glide mobile video messages (glide.me) + - **GlobalPlayerAudio** + - **GlobalPlayerAudioEpisode** + - **GlobalPlayerLive** + - **GlobalPlayerLivePlaylist** + - **GlobalPlayerVideo** - **Globo**: [*globo*](## "netrc machine") - **GloboArticle** - **glomex**: Glomex videos - **glomex:embed**: Glomex embedded videos + - **GMANetworkVideo** - **Go** - **GoDiscovery** - **GodTube** @@ -522,7 +550,6 @@ # Supported sites - **Heise** - **HellPorno** - **Helsinki**: helsinki.fi - - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** - **HGTVDe** @@ -535,6 +562,8 @@ # Supported sites - **hitbox:live** - **HitRecord** - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau + - **HollywoodReporter** + - **HollywoodReporterPlaylist** - **Holodex** - **HotNewHipHop** - **hotstar** @@ -558,6 +587,7 @@ # Supported sites - **Hypem** - **Hytale** - **Icareus** + - **IdolPlus** - **iflix:episode** - **IflixSeries** - **ign.com** @@ -600,9 +630,9 @@ # Supported sites - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV - **IVXPlayer** - - **Iwara** - - **iwara:playlist** - - **iwara:user** + - **iwara**: [*iwara*](## "netrc machine") + - **iwara:playlist**: [*iwara*](## "netrc machine") + - **iwara:user**: [*iwara*](## "netrc machine") - **Ixigua** - **Izlesene** - **Jable** @@ -612,6 +642,7 @@ # Supported sites - **JeuxVideo** - **Joj** - **Jove** + - **JStream** - **JWPlatform** - **Kakao** - **Kaltura** @@ -678,8 +709,6 @@ # Supported sites - **limelight** - **limelight:channel** - **limelight:channel_list** - - **LineLive** - - **LineLiveChannel** - **LinkedIn**: [*linkedin*](## "netrc machine") - **linkedin:learning**: [*linkedin*](## "netrc machine") - **linkedin:​learning:course**: [*linkedin*](## "netrc machine") @@ -806,6 +835,7 @@ # Supported sites - **mtvservices:embedded** - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv + - **MujRozhlas** - **Murrtube** - **MurrtubeUser**: Murrtube user profile - **MuseScore** @@ -827,6 +857,7 @@ # Supported sites - **MyVideoGe** - **MyVidster** - **MyviEmbed** + - **Mzaalo** - **n-tv.de** - **N1Info:article** - **N1InfoAsset** @@ -858,6 +889,7 @@ # Supported sites - **Nebula**: [*watchnebula*](## "netrc machine") - **nebula:channel**: [*watchnebula*](## "netrc machine") - **nebula:subscriptions**: [*watchnebula*](## "netrc machine") + - **NekoHacker** - **NerdCubedFeed** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 @@ -893,6 +925,9 @@ # Supported sites - **NhkForSchoolBangumi** - **NhkForSchoolProgramList** - **NhkForSchoolSubject**: Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学) + - **NhkRadioNewsPage** + - 
**NhkRadiru**: NHK らじる (Radiru/Rajiru) + - **NhkRadiruLive** - **NhkVod** - **NhkVodProgram** - **nhl.com** @@ -903,6 +938,7 @@ # Supported sites - **nicknight** - **niconico**: [*niconico*](## "netrc machine") ニコニコ動画 - **niconico:history**: NicoNico user history or likes. Requires cookies. + - **niconico:live**: ニコニコ生放送 - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs @@ -947,6 +983,7 @@ # Supported sites - **NRKTVSeries** - **NRLTV** - **ntv.ru** + - **NubilesPorn**: [*nubiles-porn*](## "netrc machine") - **Nuvid** - **NYTimes** - **NYTimesArticle** @@ -987,6 +1024,7 @@ # Supported sites - **OsnatelTVLive**: [*osnateltv*](## "netrc machine") - **OsnatelTVRecordings**: [*osnateltv*](## "netrc machine") - **OutsideTV** + - **OwnCloud** - **PacktPub**: [*packtpub*](## "netrc machine") - **PacktPubCourse** - **PalcoMP3:artist** @@ -999,6 +1037,7 @@ # Supported sites - **ParamountNetwork** - **ParamountPlus** - **ParamountPlusSeries** + - **ParamountPressExpress** - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview** @@ -1016,6 +1055,7 @@ # Supported sites - **PerformGroup** - **periscope**: Periscope - **periscope:user**: Periscope user videos + - **PGATour** - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** @@ -1057,7 +1097,6 @@ # Supported sites - **PolskieRadio** - **polskieradio:audition** - **polskieradio:category** - - **polskieradio:kierowcow** - **polskieradio:legacy** - **polskieradio:player** - **polskieradio:podcast** @@ -1122,6 +1161,7 @@ # Supported sites - **radlive:channel** - **radlive:season** - **Rai** + - **RaiCultura** - **RaiNews** - **RaiPlay** - **RaiPlayLive** @@ -1142,11 +1182,12 @@ # Supported sites - **RCTIPlusSeries** - **RCTIPlusTV** - **RDS**: RDS.ca + - **Recurbate** - **RedBull** - **RedBullEmbed** - **RedBullTV** - **RedBullTVRrnContent** - - **Reddit** + - **Reddit**: [*reddit*](## "netrc machine") - **RedGifs** - **RedGifsSearch**: Redgifs search - **RedGifsUser**: Redgifs user @@ -1186,6 +1227,9 @@ # Supported sites - **RTP** - **RTRFM** - **RTS**: RTS.ch + - **RTVCKaltura** + - **RTVCPlay** + - **RTVCPlayEmbed** - **rtve.es:alacarta**: RTVE a la carta - **rtve.es:audio**: RTVE audio - **rtve.es:infantil**: RTVE infantil @@ -1239,6 +1283,7 @@ # Supported sites - **SCTE**: [*scte*](## "netrc machine") - **SCTECourse**: [*scte*](## "netrc machine") - **Seeker** + - **SenalColombiaLive** - **SenateGov** - **SenateISVP** - **SendtoNews** @@ -1315,6 +1360,7 @@ # Supported sites - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites + - **StagePlusVODConcert**: [*stageplus*](## "netrc machine") - **stanfordoc**: Stanford Open ClassRoom - **StarTrek** - **startv** @@ -1427,6 +1473,7 @@ # Supported sites - **TrailerAddict**: (**Currently broken**) - **TravelChannel** - **Triller**: [*triller*](## "netrc machine") + - **TrillerShort** - **TrillerUser**: [*triller*](## "netrc machine") - **Trilulilu** - **Trovo** @@ -1499,12 +1546,12 @@ # Supported sites - **TwitchVideos**: [*twitch*](## "netrc machine") - **TwitchVideosClips**: [*twitch*](## "netrc machine") - **TwitchVideosCollections**: [*twitch*](## "netrc machine") - - **twitter** - - **twitter:amplify** - - **twitter:broadcast** + - **twitter**: [*twitter*](## "netrc machine") + - **twitter:amplify**: [*twitter*](## "netrc machine") + - **twitter:broadcast**: [*twitter*](## "netrc machine") - **twitter:card** - - 
**twitter:shortener** - - **twitter:spaces** + - **twitter:shortener**: [*twitter*](## "netrc machine") + - **twitter:spaces**: [*twitter*](## "netrc machine") - **Txxx** - **udemy**: [*udemy*](## "netrc machine") - **udemy:course**: [*udemy*](## "netrc machine") @@ -1541,7 +1588,6 @@ # Supported sites - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **vhx:embed**: [*vimeo*](## "netrc machine") - - **Viafree** - **vice** - **vice:article** - **vice:show** @@ -1607,8 +1653,8 @@ # Supported sites - **voicy** - **voicy:channel** - **VolejTV** - - **Voot** - - **VootSeries** + - **Voot**: [*voot*](## "netrc machine") + - **VootSeries**: [*voot*](## "netrc machine") - **VoxMedia** - **VoxMediaVolume** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -1616,7 +1662,7 @@ # Supported sites - **vqq:video** - **Vrak** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - - **VrtNU**: [*vrtnu*](## "netrc machine") VrtNU.be + - **VrtNU**: [*vrtnu*](## "netrc machine") VRT MAX - **vrv**: [*vrv*](## "netrc machine") - **vrv:series** - **VShare** @@ -1660,7 +1706,16 @@ # Supported sites - **WeiqiTV**: WQTV - **wetv:episode** - **WeTvSeries** + - **Weverse**: [*weverse*](## "netrc machine") + - **WeverseLive**: [*weverse*](## "netrc machine") + - **WeverseLiveTab**: [*weverse*](## "netrc machine") + - **WeverseMedia**: [*weverse*](## "netrc machine") + - **WeverseMediaTab**: [*weverse*](## "netrc machine") + - **WeverseMoment**: [*weverse*](## "netrc machine") + - **WeVidi** + - **Weyyak** - **whowatch** + - **Whyp** - **wikimedia.org** - **Willow** - **WimTV** @@ -1674,13 +1729,17 @@ # Supported sites - **WorldStarHipHop** - **wppilot** - **wppilot:channels** - - **WrestleUniversePPV** - - **WrestleUniverseVOD** + - **WrestleUniversePPV**: [*wrestleuniverse*](## "netrc machine") + - **WrestleUniverseVOD**: [*wrestleuniverse*](## "netrc machine") - **WSJ**: Wall Street Journal - **WSJArticle** - **WWE** - **wyborcza:video** - **WyborczaPodcast** + - **wykop:dig** + - **wykop:​dig:comment** + - **wykop:post** + - **wykop:​post:comment** - **Xanimu** - **XBef** - **XboxClips** @@ -1701,8 +1760,6 @@ # Supported sites - **xvideos:quickies** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - - **yahoo:gyao** - - **yahoo:​gyao:player** - **yahoo:japannews**: Yahoo! 
Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом @@ -1714,6 +1771,7 @@ # Supported sites - **YandexVideoPreview** - **YapFiles** - **Yappy** + - **YappyProfile** - **YesJapan** - **yinyuetai:video**: 音悦Tai - **YleAreena** @@ -1746,6 +1804,8 @@ # Supported sites - **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies) - **YoutubeLivestreamEmbed**: YouTube livestream embeds - **YoutubeYtBe**: youtu.be + - **Zaiko** + - **ZaikoETicket** - **Zapiks** - **Zattoo**: [*zattoo*](## "netrc machine") - **ZattooLive**: [*zattoo*](## "netrc machine") @@ -1763,6 +1823,7 @@ # Supported sites - **zingmp3:album** - **zingmp3:chart-home** - **zingmp3:chart-music-video** + - **zingmp3:hub** - **zingmp3:user** - **zingmp3:week-chart** - **zoom** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 04bece0387..f4474db9a5 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.03.04' +__version__ = '2023.06.21' -RELEASE_GIT_HEAD = '392389b7df7b818f794b231f14dc396d4875fbad' +RELEASE_GIT_HEAD = '42f2d40b475db66486a4b4fe5b56751a640db5db' VARIANT = None From d7cd97e8d8d42b500fea9abb2aa4ac9b0f98b2ad Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 12:12:15 +0530 Subject: [PATCH 212/501] Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb Closes #7367 --- README.md | 2 +- yt_dlp/extractor/common.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2f1fd9a0d8..a2bc33fbd6 100644 --- a/README.md +++ b/README.md @@ -1221,7 +1221,7 @@ ### Authentication with netrc The default location of the .netrc file is `~` (see below). -As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter, it shall output the credentials in the netrc format and return `0` on success, other values will be treated as an error. `{}` in the command will be replaced by the name of the extractor to make it possible to select the credentials for the right extractor (To use literal braces, double them like `{{}}`). +As an alternative to using the `.netrc` file, which has the disadvantage of keeping your passwords in a plain text file, you can configure a custom shell command to provide the credentials for an extractor. This is done by providing the `--netrc-cmd` parameter, it shall output the credentials in the netrc format and return `0` on success, other values will be treated as an error. `{}` in the command will be replaced by the name of the extractor to make it possible to select the credentials for the right extractor. E.g. 
To use an encrypted `.netrc` file stored as `.authinfo.gpg` ``` diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 9662a7ee1c..2ea36c63da 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1297,8 +1297,9 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr def _get_netrc_login_info(self, netrc_machine=None): netrc_machine = netrc_machine or self._NETRC_MACHINE - cmd = self.get_param('netrc_cmd', '').format(netrc_machine) + cmd = self.get_param('netrc_cmd') if cmd: + cmd = cmd.replace('{}', netrc_machine) self.to_screen(f'Executing command: {cmd}') stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) if ret != 0: From db22142f6f817ff673d417b4b78e8db497bf8ab3 Mon Sep 17 00:00:00 2001 From: OverlordQ <overlordq@gmail.com> Date: Wed, 21 Jun 2023 03:17:07 -0400 Subject: [PATCH 213/501] [extractor/dropout] Fix season extraction (#7304) Authored by: OverlordQ --- yt_dlp/extractor/dropout.py | 54 ++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index e280b1c9f4..80ae6c1268 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -1,13 +1,17 @@ +import functools + from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, - get_elements_by_class, + get_elements_html_by_class, int_or_none, - join_nonempty, + traverse_obj, unified_strdate, urlencode_postdata, ) @@ -162,12 +166,13 @@ def _real_extract(self, url): class DropoutSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _PAGE_SIZE = 24 + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:(?P<season>[0-9]+)/?$)' _TESTS = [ { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', 'note': 'Multi-season series with the season in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -176,7 +181,7 @@ class DropoutSeasonIE(InfoExtractor): { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', 'note': 'Multi-season series with the season not in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -190,29 +195,30 @@ class DropoutSeasonIE(InfoExtractor): 'id': 'dimension-20-shriek-week-season-1', 'title': 'Dimension 20 Shriek Week - Season 1' } + }, + { + 'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3', + 'note': 'Multi-season series with season in the url that requires pagination', + 'playlist_count': 25, + 'info_dict': { + 'id': 'breaking-news-no-laugh-newsroom-season-3', + 'title': 'Breaking News No Laugh Newsroom - Season 3' + } } ] + def _fetch_page(self, url, season_id, page): + page += 1 + webpage = self._download_webpage( + f'{url}?page={page}', season_id, note=f'Downloading page {page}', expected_status={400}) + yield from [self.url_result(item_url, DropoutIE) for item_url in traverse_obj( + get_elements_html_by_class('browse-item-link', webpage), (..., {extract_attributes}, 'href'))] + def _real_extract(self, url): season_id = self._match_id(url) + season_num = self._match_valid_url(url).group('season') or 1 
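+        # `season` is the optional capture group in _VALID_URL above, so bare
+        # series URLs fall back to season 1; entries are then fetched lazily,
+        # _PAGE_SIZE (24) at a time, by the OnDemandPagedList built below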
season_title = season_id.replace('-', ' ').title() - webpage = self._download_webpage(url, season_id) - entries = [ - self.url_result( - url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']', - item, 'item_url'), - ie=DropoutIE.ie_key() - ) for item in get_elements_by_class('js-collection-item', webpage) - ] - - seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '') - current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>', - seasons, 'current_season', default='').strip() - - return { - '_type': 'playlist', - 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), - 'title': join_nonempty(season_title, current_season, delim=' - '), - 'entries': entries - } + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, url, season_id), self._PAGE_SIZE), + f'{season_id}-season-{season_num}', f'{season_title} - Season {season_num}') From 7f9c6a63b16e145495479e9f666f5b9e2ee69e2f Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Wed, 21 Jun 2023 03:24:24 -0500 Subject: [PATCH 214/501] [cleanup] Misc Authored by: bashonly --- README.md | 8 ++++---- yt_dlp/extractor/twitch.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a2bc33fbd6..d9b7a9e5d4 100644 --- a/README.md +++ b/README.md @@ -1780,7 +1780,7 @@ # Do not set any "synopsis" in the video metadata $ yt-dlp --parse-metadata ":(?P<meta_synopsis>)" # Remove "formats" field from the infojson by setting it to an empty string -$ yt-dlp --parse-metadata ":(?P<formats>)" -j +$ yt-dlp --parse-metadata "video::(?P<formats>)" --write-info-json # Replace all spaces and "_" in title and uploader with a `-` $ yt-dlp --replace-in-metadata "title,uploader" "[ _]" "-" @@ -1854,11 +1854,11 @@ #### rokfinchannel #### twitter * `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed -### wrestleuniverse +#### wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage -#### twitchstream (Twitch) -* `client_id`: Client ID value to be sent with GraphQL requests, e.g. `twitchstream:client_id=kimne78kx3ncx6brgo4mv6wki5h1ko` +#### twitch +* `client_id`: Client ID value to be sent with GraphQL requests, e.g. `twitch:client_id=kimne78kx3ncx6brgo4mv6wki5h1ko` #### nhkradirulive (NHK らじる★らじる LIVE) * `area`: Which regional variation to extract. Valid areas are: `sapporo`, `sendai`, `tokyo`, `nagoya`, `osaka`, `hiroshima`, `matsuyama`, `fukuoka`. 
Defaults to `tokyo` diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 31b349bc68..c8ee520144 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -60,7 +60,7 @@ class TwitchBaseIE(InfoExtractor): @property def _CLIENT_ID(self): return self._configuration_arg( - 'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key=TwitchStreamIE, casesense=True)[0] + 'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key='Twitch', casesense=True)[0] def _perform_login(self, username, password): def fail(message): From 3f756c8c4095b942cf49788eb0862ceaf57847f2 Mon Sep 17 00:00:00 2001 From: Roland Hieber <rohieb@users.noreply.github.com> Date: Wed, 21 Jun 2023 10:29:34 +0200 Subject: [PATCH 215/501] [extractor/nebula] Fix extractor (#7156) Closes #7017 Authored by: Lamieur, rohieb Co-authored-by: Lam <github@Lam.pl> --- yt_dlp/extractor/nebula.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 5c1b7c7125..7a5a02dfa6 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -3,7 +3,7 @@ import urllib.error from .common import InfoExtractor -from ..utils import ExtractorError, parse_iso8601 +from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' @@ -65,19 +65,20 @@ def _fetch_nebula_bearer_token(self): return response['token'] def _fetch_video_formats(self, slug): - stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/', + stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/', video_id=slug, auth_type='bearer', note='Fetching video stream info') manifest_url = stream_info['manifest'] - return self._extract_m3u8_formats_and_subtitles(manifest_url, slug) + return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4') def _build_video_info(self, episode): fmts, subs = self._fetch_video_formats(episode['slug']) channel_slug = episode['channel_slug'] channel_title = episode['channel_title'] + zype_id = episode.get('zype_id') return { - 'id': episode['zype_id'], + 'id': remove_start(episode['id'], 'video_episode:'), 'display_id': episode['slug'], 'formats': fmts, 'subtitles': subs, @@ -99,6 +100,9 @@ def _build_video_info(self, episode): 'uploader_url': f'https://nebula.tv/{channel_slug}', 'series': channel_title, 'creator': channel_title, + 'extractor_key': NebulaIE.ie_key(), + 'extractor': NebulaIE.IE_NAME, + '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None, } def _perform_login(self, username=None, password=None): @@ -113,7 +117,7 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'md5': '14944cfee8c7beeea106320c47560efc', 'info_dict': { - 'id': '5c271b40b13fd613090034fd', + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', 'ext': 'mp4', 'title': 'That Time Disney Remade Beauty and the Beast', 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', @@ -137,22 +141,22 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'md5': 'd05739cf6c38c09322422f696b569c23', 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', 'ext': 'mp4', 'title': 'Landing Craft - How The Allies Got Ashore', 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'Real Engineering', - 'channel_id': 'realengineering', - 'uploader': 'Real Engineering', - 'uploader_id': 'realengineering', - 'series': 'Real Engineering', + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'creator': 'Real Engineering', + 'creator': 'Real Engineering — The Logistics of D-Day', 'duration': 841, - 'channel_url': 'https://nebula.tv/realengineering', - 'uploader_url': 'https://nebula.tv/realengineering', + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, @@ -160,7 +164,7 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', 'md5': 'ebe28a7ad822b9ee172387d860487868', 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', 'ext': 'mp4', 'title': 'Episode 1: The Draw', 'description': r'contains:There’s free money on offer… if the players can all work together.', @@ -190,7 +194,7 @@ class NebulaIE(NebulaBaseIE): ] def _fetch_video_metadata(self, slug): - return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/', video_id=slug, auth_type='bearer', note='Fetching video meta data') From a4486bfc1dc7057efca9dd3fe70d7fa25c56f700 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 21 Jun 2023 12:35:14 +0530 Subject: [PATCH 216/501] Revert "[misc] Add automatic duplicate issue detection" This reverts commit 15b2d3db1d40b0437fca79d8874d392aa54b3cdd. 
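A note on the nebula fix (patch 215/501) above: when a site migrates its video IDs, as nebula did from Zype IDs to native UUIDs, populating `_old_archive_ids` via `make_archive_id` keeps entries already recorded in a `--download-archive` file matching. A minimal sketch of the pattern, reusing the IDs from the test diff above:

```python
from yt_dlp.utils import make_archive_id

info_dict = {
    'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',  # new native ID
    # make_archive_id('Nebula', ...) yields 'nebula 5c271b40b13fd613090034fd',
    # the exact string the pre-fix extractor would have written to the archive
    '_old_archive_ids': [make_archive_id('Nebula', '5c271b40b13fd613090034fd')],
}
```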
--- .github/workflows/potential-duplicates.yml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .github/workflows/potential-duplicates.yml diff --git a/.github/workflows/potential-duplicates.yml b/.github/workflows/potential-duplicates.yml deleted file mode 100644 index cfc5831864..0000000000 --- a/.github/workflows/potential-duplicates.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Potential Duplicates -on: - issues: - types: [opened, edited] - -jobs: - run: - runs-on: ubuntu-latest - steps: - - uses: wow-actions/potential-duplicates@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - label: potential-duplicate - state: all - threshold: 0.3 - comment: | - This issue is potentially a duplicate of one of the following issues: - {{#issues}} - - #{{ number }} ({{ accuracy }}%) - {{/issues}} From 81ca451480051d7ce1a31c017e005358345a9149 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 00:15:22 +0530 Subject: [PATCH 217/501] [extractor/youtube] Workaround 403 for android formats Ref: https://github.com/TeamNewPipe/NewPipe/issues/9038#issuecomment-1289756816 --- yt_dlp/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 11e47904a5..a77a626ac5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3599,7 +3599,7 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - _STORY_PLAYER_PARAMS = '8AEB' + _PLAYER_PARAMS = 'CgIQBg==' def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): @@ -3613,7 +3613,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, 'videoId': video_id, } if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': - yt_query['params'] = self._STORY_PLAYER_PARAMS + yt_query['params'] = self._PLAYER_PARAMS yt_query.update(self._generate_player_context(sts)) return self._extract_response( @@ -4011,8 +4011,8 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - if smuggled_data.get('is_story'): - query['pp'] = self._STORY_PLAYER_PARAMS + if smuggled_data.get('is_story'): # XXX: Deprecated + query['pp'] = self._PLAYER_PARAMS webpage = self._download_webpage( webpage_url, video_id, fatal=False, query=query) From 1e75d97db21152acc764b30a688e516f04b8a142 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 00:20:04 +0530 Subject: [PATCH 218/501] [extractor/youtube] Add `ios` to default clients used * IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively * IOS also has higher bit-rate "premium" formats though they are not labeled as such --- README.md | 2 +- yt_dlp/extractor/youtube.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d9b7a9e5d4..d108321038 100644 --- a/README.md +++ b/README.md @@ -1798,7 +1798,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated.
See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. 
Default is `all,all,all,all` diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a77a626ac5..a90118680f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -258,7 +258,7 @@ def build_innertube_clients(): THIRD_PARTY = { 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL } - BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb') + BASE_CLIENTS = ('ios', 'android', 'web', 'tv', 'mweb') priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): @@ -3625,7 +3625,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - default = ['android', 'web'] + default = ['ios', 'android', 'web'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) @@ -3932,6 +3932,10 @@ def process_manifest_format(f, proto, client_name, itag): elif itag: f['format_id'] = itag + if itag in ('616', '235'): + f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') + f['source_preference'] = (f.get('source_preference') or -1) + 100 + f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) From 0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 01:37:55 +0530 Subject: [PATCH 219/501] Indicate `filesize` approximated from `tbr` better --- yt_dlp/YoutubeDL.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index bc5c1b95ee..79b7d47b03 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2666,7 +2666,8 @@ def is_wellformed(f): format['dynamic_range'] = 'SDR' if format.get('aspect_ratio') is None: format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) - if (info_dict.get('duration') and format.get('tbr') + if (not format.get('manifest_url') # For fragmented formats, "tbr" is often max bitrate and not average + and info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict)) @@ -3707,8 +3708,11 @@ def simplified_codec(f, field): format_field(f, 'fps', '\t%d', func=round), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), format_field(f, 'audio_channels', '\t%s'), - delim, - format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), + delim, ( + format_field(f, 'filesize', ' \t%s', func=format_bytes) + or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes) + or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))), + None, self._format_out('~\t%s', self.Styles.SUPPRESS))), format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, From 5fd8367496b42c7b900b896a0d5460561a2859de Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 02:57:00 +0530 Subject: [PATCH 220/501] [extractor] Support multiple 
`_VALID_URL`s (#5812) Authored by: nixxo --- devscripts/lazy_load_template.py | 1 + yt_dlp/extractor/common.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index c8815e01bc..6f52165c5c 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -6,6 +6,7 @@ age_restricted, bug_reports_message, classproperty, + variadic, write_string, ) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 2ea36c63da..3f7dcb82bb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -475,8 +475,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. + should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -566,8 +566,8 @@ def _match_valid_url(cls, url): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): From f2ff0f6f1914b82d4a51681a72cc0828115dcb4a Mon Sep 17 00:00:00 2001 From: rexlambert22 <135362223+rexlambert22@users.noreply.github.com> Date: Wed, 21 Jun 2023 20:00:54 -0400 Subject: [PATCH 221/501] [extractor/motherless] Add gallery support, fix groups (#7211) Authored by: rexlambert22 --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/motherless.py | 223 ++++++++++++++++++-------------- 2 files changed, 125 insertions(+), 101 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ff659a7a29..15344a6e5a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1119,7 +1119,8 @@ from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, - MotherlessGroupIE + MotherlessGroupIE, + MotherlessGalleryIE, ) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index c24ef9b0d1..769b52ce6d 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -1,32 +1,39 @@ import datetime import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( ExtractorError, - InAdvancePagedList, - orderedSet, + OnDemandPagedList, + remove_end, str_to_int, unified_strdate, ) class MotherlessIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)' _TESTS = [{ - 'url': 'http://motherless.com/AC3FFE1', - 'md5': '310f62e325a9fafe64f68c0bccb6e75f', + 'url': 'http://motherless.com/EE97006', + 'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc', 'info_dict': { - 'id': 'AC3FFE1', + 'id': 'EE97006', 'ext': 'mp4', - 'title': 'Fucked in the ass while playing PS3', - 'categories': ['Gaming', 'anal', 
'reluctant', 'rough', 'Wife'], - 'upload_date': '20100913', - 'uploader_id': 'famouslyfuckedup', + 'title': 'Dogging blond Brit getting glazed (comp)', + 'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'], + 'upload_date': '20230519', + 'uploader_id': 'deathbird', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + 'params': { + # Incomplete cert chains + 'nocheckcertificate': True, + }, }, { 'url': 'http://motherless.com/532291B', 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', @@ -49,16 +56,36 @@ class MotherlessIE(InfoExtractor): 'id': '633979F', 'ext': 'mp4', 'title': 'Turtlette', - 'categories': ['superheroine heroine superher'], + 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + 'like_count': int, + 'comment_count': int, + 'view_count': int, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { - # no keywords 'url': 'http://motherless.com/8B4BBC1', - 'only_matching': True, + 'info_dict': { + 'id': '8B4BBC1', + 'ext': 'mp4', + 'title': 'VIDEO00441.mp4', + 'categories': [], + 'upload_date': '20160214', + 'uploader_id': 'NMWildGirl', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + 'like_count': int, + 'comment_count': int, + 'view_count': int, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { # see https://motherless.com/videos/recent for recent videos with # uploaded date in "ago" format @@ -72,9 +99,12 @@ class MotherlessIE(InfoExtractor): 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, + 'like_count': int, + 'comment_count': int, + 'view_count': int, }, 'params': { - 'skip_download': True, + 'nocheckcertificate': True, }, }] @@ -128,10 +158,8 @@ def _real_extract(self, url): (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id', fatal=False) - - categories = self._html_search_meta('keywords', webpage, default=None) - if categories: - categories = [cat.strip() for cat in categories.split(',')] + categories = self._html_search_meta('keywords', webpage, default='') + categories = [cat.strip() for cat in categories.split(',') if cat.strip()] return { 'id': video_id, @@ -148,102 +176,97 @@ def _real_extract(self, url): } -class MotherlessGroupIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' +class MotherlessPaginatedIE(InfoExtractor): + _PAGE_SIZE = 60 + + def _correct_path(self, url, item_id): + raise NotImplementedError('This method must be implemented by subclasses') + + def _extract_entries(self, webpage, base): + for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)', + webpage): + video_url = urllib.parse.urljoin(base, mobj.group('href')) + video_id = MotherlessIE.get_temp_id(video_url) + + if video_id: + yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title')) + + def _real_extract(self, url): + item_id = self._match_id(url) + real_url = self._correct_path(url, item_id) + webpage = self._download_webpage(real_url, item_id, 'Downloading page 1') + + def get_page(idx): + page = idx + 1 + current_page = webpage if not idx else self._download_webpage( + real_url, item_id, note=f'Downloading page {page}', query={'page': page}) + yield from 
self._extract_entries(current_page, real_url) + + return self.playlist_result( + OnDemandPagedList(get_page, self._PAGE_SIZE), item_id, + remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™')) + + +class MotherlessGroupIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])' _TESTS = [{ - 'url': 'http://motherless.com/g/movie_scenes', + 'url': 'http://motherless.com/gv/movie_scenes', 'info_dict': { 'id': 'movie_scenes', 'title': 'Movie Scenes', - 'description': 'Hot and sexy scenes from "regular" movies... ' - 'Beautiful actresses fully nude... A looot of ' - 'skin! :)Enjoy!', }, - 'playlist_mincount': 662, + 'playlist_mincount': 540, }, { - 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'url': 'http://motherless.com/g/sex_must_be_funny', 'info_dict': { 'id': 'sex_must_be_funny', 'title': 'Sex must be funny', - 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' - 'any kind!' }, - 'playlist_mincount': 0, - 'expected_warnings': [ - 'This group has no videos.', - ] + 'playlist_count': 0, }, { - 'url': 'https://motherless.com/g/beautiful_cock', + 'url': 'https://motherless.com/gv/beautiful_cock', 'info_dict': { 'id': 'beautiful_cock', 'title': 'Beautiful Cock', - 'description': 'Group for lovely cocks yours, mine, a friends anything human', }, - 'playlist_mincount': 2500, + 'playlist_mincount': 2040, }] - @classmethod - def suitable(cls, url): - return (False if MotherlessIE.suitable(url) - else super(MotherlessGroupIE, cls).suitable(url)) + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/gv/{item_id}') - def _extract_entries(self, webpage, base): - entries = [] - for mobj in re.finditer( - r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', - webpage): - video_url = compat_urlparse.urljoin(base, mobj.group('href')) - if not MotherlessIE.suitable(video_url): - continue - video_id = MotherlessIE._match_id(video_url) - title = mobj.group('title') - entries.append(self.url_result( - video_url, ie=MotherlessIE.ie_key(), video_id=video_id, - video_title=title)) - # Alternative fallback - if not entries: - entries = [ - self.url_result( - compat_urlparse.urljoin(base, '/' + entry_id), - ie=MotherlessIE.ie_key(), video_id=entry_id) - for entry_id in orderedSet(re.findall( - r'data-codename=["\']([A-Z0-9]+)', webpage))] - return entries - def _real_extract(self, url): - group_id = self._match_id(url) - page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) - webpage = self._download_webpage(page_url, group_id) - title = self._search_regex( - r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) - description = self._html_search_meta( - 'description', webpage, fatal=False) - page_count = str_to_int(self._search_regex( - r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', - webpage, 'page_count', default=0)) - if not page_count: - message = self._search_regex( - r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', - webpage, 'error_msg', default=None) or 'This group has no videos.' 
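# Illustrative contrast, not part of the diff: the legacy code being removed
# here scraped a page count up-front and walked it with InAdvancePagedList,
# while the new MotherlessPaginatedIE above pages lazily and collects entries
# with a single finditer pass. A self-contained sketch of that link-extraction
# pattern, run on hypothetical HTML:
import re
import urllib.parse

webpage = '<a href="/ABC123DEF" title="Example video"><img src="thumb.jpg"></a>'
for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)', webpage):
    video_url = urllib.parse.urljoin('https://motherless.com/', mobj.group('href'))
    print(video_url, '->', mobj.group('title'))
# prints: https://motherless.com/ABC123DEF -> Example video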
- self.report_warning(message, group_id) - page_count = 1 - PAGE_SIZE = 80 +class MotherlessGalleryIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://motherless.com/GV338999F', + 'info_dict': { + 'id': '338999F', + 'title': 'Random', + }, + 'playlist_mincount': 190, + }, { + 'url': 'https://motherless.com/GVABD6213', + 'info_dict': { + 'id': 'ABD6213', + 'title': 'Cuties', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://motherless.com/GVBCF7622', + 'info_dict': { + 'id': 'BCF7622', + 'title': 'Vintage', + }, + 'playlist_count': 0, + }, { + 'url': 'https://motherless.com/G035DE2F', + 'info_dict': { + 'id': '035DE2F', + 'title': 'General', + }, + 'playlist_mincount': 420, + }] - def _get_page(idx): - if idx > 0: - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) - for entry in self._extract_entries(webpage, url): - yield entry - - playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) - - return { - '_type': 'playlist', - 'id': group_id, - 'title': title, - 'description': description, - 'entries': playlist - } + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/GV{item_id}') From 774aa09dd6aa61ced9ec818d1f67e53414d22762 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 Jun 2023 00:16:39 -0500 Subject: [PATCH 222/501] [extractor/dplay] GlobalCyclingNetworkPlus: Add extractor (#7360) * Allows `country` API param to be configured with `--xff`/`geo_bypass_country` Closes #7324 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/dplay.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 15344a6e5a..a49a57a689 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -497,6 +497,7 @@ DiscoveryPlusItalyIE, DiscoveryPlusItalyShowIE, DiscoveryPlusIndiaShowIE, + GlobalCyclingNetworkPlusIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 8eb4d8ffa8..cf6d149342 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -65,6 +65,7 @@ def _download_video_playback_info(self, disco_base, video_id, headers): return streaming_list def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''): + country = self.get_param('geo_bypass_country') or country geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, @@ -1001,3 +1002,39 @@ class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE): _SHOW_STR = 'show' _INDEX = 4 _VIDEO_IE = DiscoveryPlusIndiaIE + + +class GlobalCyclingNetworkPlusIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://plus\.globalcyclingnetwork\.com/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://plus.globalcyclingnetwork.com/watch/1397691', + 'info_dict': { + 'id': '1397691', + 'ext': 'mp4', + 'title': 'The Athertons: Mountain Biking\'s Fastest Family', + 'description': 'md5:75a81937fcd8b989eec6083a709cd837', + 'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/04/eb9e3026-4849-3001-8281-9356466f0557.png', + 'series': 'gcn', + 'creator': 'Gcn', + 'upload_date': '20210309', + 'timestamp': 1615248000, + 'duration': 2531.0, + 'tags': [], + }, + 'skip': 'Subscription required', + 
'params': {'skip_download': 'm3u8'}, + }] + + _PRODUCT = 'web' + _DISCO_API_PARAMS = { + 'disco_host': 'disco-api-prod.globalcyclingnetwork.com', + 'realm': 'gcn', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:27.3.2', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) From 98cb1eda7a4cf67c96078980dbd63e6c06ad7f7c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 Jun 2023 00:24:52 -0500 Subject: [PATCH 223/501] [extractor/rheinmaintv] Add extractor (#7311) Authored by: barthelmannk Co-authored-by: barthelmannk <81305638+barthelmannk@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rheinmaintv.py | 94 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 yt_dlp/extractor/rheinmaintv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a49a57a689..49a3f39d37 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1617,6 +1617,7 @@ from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE +from .rheinmaintv import RheinMainTVIE from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .rockstargames import RockstarGamesIE diff --git a/yt_dlp/extractor/rheinmaintv.py b/yt_dlp/extractor/rheinmaintv.py new file mode 100644 index 0000000000..c3b352dede --- /dev/null +++ b/yt_dlp/extractor/rheinmaintv.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, merge_dicts, remove_end + + +class RheinMainTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)' + _TESTS = [{ + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/', + 'info_dict': { + 'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022', + 'ext': 'ismv', # ismv+isma will be merged into mp4 + 'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'upload_date': '20221108', + 'view_count': int, + 'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft', + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9', + 'timestamp': 1667933057, + 'duration': 243.0, + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'timestamp': 1668526214, + 'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften', + 'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'view_count': int, + 'thumbnail': r're:^https://.+\.jpg', + 'duration': 345.0, + 'description': 'md5:9370ba29526984006c2cba1372e5c5a0', + 'upload_date': '20221115', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 
'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'view_count': int, + 'timestamp': 1668527402, + 'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'upload_date': '20221115', + 'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften', + 'duration': 348.0, + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('display_id') + video_id = mobj.group('video_id').replace('/', '-') + webpage = self._download_webpage(url, video_id) + + source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)', + webpage, 'video', group=('source', 'img')) + source = extract_attributes(source) + img = extract_attributes(img) + + raw_json_ld = list(self._yield_json_ld(webpage, video_id)) + json_ld = self._json_ld(raw_json_ld, video_id) + json_ld.pop('url', None) + + ism_manifest_url = ( + source.get('src') + or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject') + ) + formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'title': + self._html_search_regex(r'<h1><span class="title">([^<]*)</span>', + webpage, 'headline', default=None) + or img.get('title') or json_ld.get('title') or self._og_search_title(webpage) + or remove_end(self._html_extract_title(webpage), ' -'), + 'alt_title': img.get('alt'), + 'description': json_ld.get('description') or self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'), + }, json_ld) From 71dc18fa29263a1ff0472c23d81bfc8dd4422d48 Mon Sep 17 00:00:00 2001 From: Berkan Teber <github@accounts.berkanteber.com> Date: Thu, 22 Jun 2023 10:27:54 +0300 Subject: [PATCH 224/501] [extractor/youtube] Improve description parsing performance (#7315) * The parsing is skipped when not needed * The regex is improved by simulating atomic groups with lookaheads Authored by: pukkandan, berkanteber --- yt_dlp/extractor/youtube.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a90118680f..ef9f1f11c2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4346,15 +4346,21 @@ def process_language(container, base_url, lang_code, sub_name, query): info[d_k] = parse_duration(query[k][0]) # Youtube Music Auto-generated description - if video_description: + if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'): + # XXX: Causes catastrophic backtracking if description has "·" + # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI + # Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x + # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2 mobj = re.search( r'''(?xs) - (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+ - (?P<album>[^\n]+) + (?=(?P<track>[^\n·]+))(?P=track)· + (?=(?P<artist>[^\n]+))(?P=artist)\n+ + (?=(?P<album>[^\n]+))(?P=album)\n (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? 
(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? - (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? - .+\nAuto-generated\ by\ YouTube\.\s*$ + (.+?\nArtist\s*:\s* + (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n + )?.+\nAuto-generated\ by\ YouTube\.\s*$ ''', video_description) if mobj: release_year = mobj.group('release_year') From b4e0d75848e9447cee2cd3646ce54d4744a7ff56 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 04:54:39 +0530 Subject: [PATCH 225/501] Improve `--download-sections` * Support negative time-ranges * Add `*from-url` to obey time-ranges in URL Closes #7248 --- README.md | 14 +++++----- yt_dlp/YoutubeDL.py | 6 +++-- yt_dlp/__init__.py | 59 +++++++++++++++++++++++++++++------------- yt_dlp/options.py | 5 ++-- yt_dlp/utils/_utils.py | 22 +++++++++++++--- 5 files changed, 74 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index d108321038..8db2d4f067 100644 --- a/README.md +++ b/README.md @@ -610,12 +610,14 @@ ## Download Options: --no-hls-use-mpegts Do not use the mpegts container for HLS videos. This is default when not downloading live streams - --download-sections REGEX Download only chapters whose title matches - the given regular expression. Time ranges - prefixed by a "*" can also be used in place - of chapters to download the specified range. - Needs ffmpeg. This option can be used - multiple times to download multiple + --download-sections REGEX Download only chapters that match the + regular expression. A "*" prefix denotes + time-range instead of chapter. Negative + timestamps are calculated from the end. + "*from-url" can be used to download between + the "start_time" and "end_time" extracted + from the URL. Needs ffmpeg. This option can + be used multiple times to download multiple sections, e.g. --download-sections "*10:15-inf" --download-sections "intro" --downloader [PROTO:]NAME Name or path of the external downloader to diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 79b7d47b03..6dade0b2a4 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2806,11 +2806,13 @@ def to_screen(*msg): new_info.update(fmt) offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') end_time = offset + min(chapter.get('end_time', duration), duration) + # duration may not be accurate. So allow deviations <1sec + if end_time == float('inf') or end_time > offset + duration + 1: + end_time = None if chapter or offset: new_info.update({ 'section_start': offset + chapter.get('start_time', 0), - # duration may not be accurate. 
So allow deviations <1sec - 'section_end': end_time if end_time <= offset + duration + 1 else None, + 'section_end': end_time, 'section_title': chapter.get('title'), 'section_number': chapter.get('index'), }) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 46edd88d3e..b81277a572 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -320,26 +320,49 @@ def validate_outtmpl(tmpl, msg): opts.skip_download = None del opts.outtmpl['default'] - def parse_chapters(name, value): - chapters, ranges = [], [] + def parse_chapters(name, value, advanced=False): parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x) - for regex in value or []: - if regex.startswith('*'): - for range_ in map(str.strip, regex[1:].split(',')): - mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_) - dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf')) - if None in (dur or [None]): - raise ValueError(f'invalid {name} time range "{regex}". Must be of the form "*start-end"') - ranges.append(dur) - continue - try: - chapters.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid {name} regex "{regex}" - {err}') - return chapters, ranges + TIMESTAMP_RE = r'''(?x)(?: + (?P<start_sign>-?)(?P<start>[^-]+) + )?\s*-\s*(?: + (?P<end_sign>-?)(?P<end>[^-]+) + )?''' - opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) - opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) + chapters, ranges, from_url = [], [], False + for regex in value or []: + if advanced and regex == '*from-url': + from_url = True + continue + elif not regex.startswith('*'): + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + continue + + for range_ in map(str.strip, regex[1:].split(',')): + mobj = range_ != '-' and re.fullmatch(TIMESTAMP_RE, range_) + dur = mobj and [parse_timestamp(mobj.group('start') or '0'), parse_timestamp(mobj.group('end') or 'inf')] + signs = mobj and (mobj.group('start_sign'), mobj.group('end_sign')) + + err = None + if None in (dur or [None]): + err = 'Must be of the form "*start-end"' + elif not advanced and any(signs): + err = 'Negative timestamps are not allowed' + else: + dur[0] *= -1 if signs[0] else 1 + dur[1] *= -1 if signs[1] else 1 + if dur[1] == float('-inf'): + err = '"-inf" is not a valid end' + if err: + raise ValueError(f'invalid {name} time range "{regex}". {err}') + ranges.append(dur) + + return chapters, ranges, from_url + + opts.remove_chapters, opts.remove_ranges, _ = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges, True)) # Cookies from browser if opts.cookiesfrombrowser: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 9d6dbec9fc..163809706a 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1012,8 +1012,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--download-sections', metavar='REGEX', dest='download_ranges', action='append', help=( - 'Download only chapters whose title matches the given regular expression. ' - 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' + 'Download only chapters that match the regular expression. ' + 'A "*" prefix denotes time-range instead of chapter. 
Negative timestamps are calculated from the end. ' + '"*from-url" can be used to download between the "start_time" and "end_time" extracted from the URL. ' 'Needs ffmpeg. This option can be used multiple times to download multiple sections, ' 'e.g. --download-sections "*10:15-inf" --download-sections "intro"')) downloader.add_option( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index bc1bc9116c..56acadd736 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -3753,11 +3753,11 @@ def _match_func(info_dict, incomplete=False): class download_range_func: - def __init__(self, chapters, ranges): - self.chapters, self.ranges = chapters, ranges + def __init__(self, chapters, ranges, from_info=False): + self.chapters, self.ranges, self.from_info = chapters, ranges, from_info def __call__(self, info_dict, ydl): - if not self.ranges and not self.chapters: + if not any((self.ranges, self.chapters, self.from_info)): yield {} warning = ('There are no chapters matching the regex' if info_dict.get('chapters') @@ -3770,7 +3770,21 @@ def __call__(self, info_dict, ydl): if self.chapters and warning: ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') - yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or []) + for start, end in self.ranges or []: + yield { + 'start_time': self._handle_negative_timestamp(start, info_dict), + 'end_time': self._handle_negative_timestamp(end, info_dict), + } + + if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')): + yield { + 'start_time': info_dict.get('start_time'), + 'end_time': info_dict.get('end_time'), + } + + @staticmethod + def _handle_negative_timestamp(time, info): + return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time def __eq__(self, other): return (isinstance(other, download_range_func) From cd810afe2ac5567c822b7424800fc470ef2d0045 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 13:23:31 +0530 Subject: [PATCH 226/501] [extractor/youtube] Improve nsig function name extraction --- test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 811f70e689..c5592845b3 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -159,6 +159,10 @@ 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', '1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A', ), + ( + 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', + '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ef9f1f11c2..f1c8df1410 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3140,7 +3140,7 @@ def _extract_n_function_name(self, jscode): return funcname return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] def _extract_n_function_code(self, video_id, player_url): From 812cdfa06c33a40e73a8e04b3e6f42c084666a43 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Jun 2023 10:02:38 +0530 Subject: [PATCH 227/501] [cleanup] Misc --- README.md | 2 +- devscripts/changelog_override.json | 21 +++++++++++++++++++++ devscripts/make_changelog.py | 14 
++++++++------ yt_dlp/extractor/testurl.py | 8 ++++++-- yt_dlp/utils/_utils.py | 7 ++++--- 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 8db2d4f067..4de4ece969 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)** ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/yt-dlp/yt-dlp/commit/42f2d4) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 73225bdb90..df80f45e0f 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -35,5 +35,26 @@ "when": "8417f26b8a819cd7ffcd4e000ca3e45033e670fb", "short": "Add option `--color` (#6904)", "authors": ["Grub4K"] + }, + { + "action": "change", + "when": "7b37e8b23691613f331bd4ebc9d639dd6f93c972", + "short": "Improve `--download-sections`\n - Support negative time-ranges\n - Add `*from-url` to obey time-ranges in URL" + }, + { + "action": "change", + "when": "1e75d97db21152acc764b30a688e516f04b8a142", + "short": "[extractor/youtube] Add `ios` to default clients used\n - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively\n - IOS also has higher bit-rate 'premium' formats though they are not labeled as such" + }, + { + "action": "change", + "when": "f2ff0f6f1914b82d4a51681a72cc0828115dcb4a", + "short": "[extractor/motherless] Add gallery support, fix groups (#7211)", + "authors": ["rexlambert22", "Ti4eeT4e"] + }, + { + "action": "change", + "when": "a4486bfc1dc7057efca9dd3fe70d7fa25c56f700", + "short": "[misc] Revert \"Add automatic duplicate issue detection\"" } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 2fcdc06d77..0bcfa6ae72 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -196,7 +196,7 @@ def _prepare_cleanup_misc_items(self, items): for commit_infos in cleanup_misc_items.values(): sorted_items.append(CommitInfo( 'cleanup', ('Miscellaneous',), ', '.join( - self._format_message_link(None, info.commit.hash) + self._format_message_link(None, info.commit.hash).strip() for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), [], Commit(None, '', commit_infos[0].commit.authors), [])) @@ -205,10 +205,10 @@ def _prepare_cleanup_misc_items(self, items): def format_single_change(self, info): message = self._format_message_link(info.message, info.commit.hash) if info.issues: - message = f'{message} ({self._format_issues(info.issues)})' + message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1) if info.commit.authors: - message = f'{message} by {self._format_authors(info.commit.authors)}' + message = message.replace('\n', f' by 
{self._format_authors(info.commit.authors)}\n', 1) if info.fixes: fix_message = ', '.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes) @@ -217,14 +217,16 @@ def format_single_change(self, info): if authors != info.commit.authors: fix_message = f'{fix_message} by {self._format_authors(authors)}' - message = f'{message} (With fixes in {fix_message})' + message = message.replace('\n', f' (With fixes in {fix_message})\n', 1) - return message + return message[:-1] def _format_message_link(self, message, hash): assert message or hash, 'Improperly defined commit message or override' message = message if message else hash[:HASH_LENGTH] - return f'[{message}]({self.repo_url}/commit/{hash})' if hash else message + if not hash: + return f'{message}\n' + return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1) def _format_issues(self, issues): return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues) diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py index 0da01aa53e..3cf0017765 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -8,7 +8,7 @@ class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$' + _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>\d+|all))?$' def _real_extract(self, url): from . import gen_extractor_classes @@ -36,6 +36,10 @@ def _real_extract(self, url): extractor = matching_extractors[0] testcases = tuple(extractor.get_testcases(True)) + if num == 'all': + return self.playlist_result( + [self.url_result(tc['url'], extractor) for tc in testcases], + url, f'{extractor.IE_NAME} tests') try: tc = testcases[int(num or 0)] except IndexError: @@ -43,4 +47,4 @@ def _real_extract(self, url): f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True) self.to_screen(f'Test URL: {tc["url"]}') - return self.url_result(tc['url']) + return self.url_result(tc['url'], extractor) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 56acadd736..10052009fc 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -3507,7 +3507,8 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): }, } - sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) + sanitize_codec = functools.partial( + try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower()) vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) for ext in preferences or COMPATIBLE_CODECS.keys(): @@ -5737,9 +5738,9 @@ class FormatSorter: 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), + 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none', 'function': lambda it: next(filter(None, it), None)}, - 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), + 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes', 'function': lambda it: next(filter(None, it), None)}, 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, 'res': {'type': 'multiple', 'field': ('height', 'width'), From de4cf77ec1a13f020e6afe4ed04248c6b19fccb6 Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Thu, 22 Jun 2023 08:09:31 +0000 Subject: 
[PATCH 228/501] Release 2023.06.22 Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++--- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++--- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++--- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++--- CONTRIBUTORS | 5 ++++ Changelog.md | 29 +++++++++++++++++++ supportedsites.md | 3 ++ yt_dlp/version.py | 4 +-- 10 files changed, 63 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 351454b127..a00a11f271 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index b2a613e2f9..fc1f41ead5 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 
64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index c100561eb5..ed51dfa97d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index e97d7b5073..1c997f3e27 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) 
[debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index a44612d795..1638945bf5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index a15a469680..d27bd57426 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.06.21** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.21 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.21, Current version: 2023.06.21 - yt-dlp is up to date (2023.06.21) + Latest version: 2023.06.22, Current version: 2023.06.22 + yt-dlp is up to date (2023.06.22) <more lines> render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 3b35895d93..0864f16c4c 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -455,3 +455,8 @@ vampirefrog vidiot720 viktor-enzell zhgwn +barthelmannk +berkanteber +OverlordQ +rexlambert22 +Ti4eeT4e diff --git a/Changelog.md b/Changelog.md index d7a1cb4953..c340b74c9c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,35 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.06.22 + +#### Core changes +- [Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb](https://github.com/yt-dlp/yt-dlp/commit/d7cd97e8d8d42b500fea9abb2aa4ac9b0f98b2ad) by [pukkandan](https://github.com/pukkandan) +- [Improve `--download-sections`](https://github.com/yt-dlp/yt-dlp/commit/b4e0d75848e9447cee2cd3646ce54d4744a7ff56) by [pukkandan](https://github.com/pukkandan) +- [Indicate `filesize` approximated from `tbr` better](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) by [pukkandan](https://github.com/pukkandan) + +#### Extractor changes +- [Support multiple `_VALID_URL`s](https://github.com/yt-dlp/yt-dlp/commit/5fd8367496b42c7b900b896a0d5460561a2859de) ([#5812](https://github.com/yt-dlp/yt-dlp/issues/5812)) by [nixxo](https://github.com/nixxo) +- **dplay**: GlobalCyclingNetworkPlus: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/774aa09dd6aa61ced9ec818d1f67e53414d22762) ([#7360](https://github.com/yt-dlp/yt-dlp/issues/7360)) by [bashonly](https://github.com/bashonly) +- **dropout**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/db22142f6f817ff673d417b4b78e8db497bf8ab3) ([#7304](https://github.com/yt-dlp/yt-dlp/issues/7304)) by [OverlordQ](https://github.com/OverlordQ) +- **motherless**: [Add gallery support, fix groups](https://github.com/yt-dlp/yt-dlp/commit/f2ff0f6f1914b82d4a51681a72cc0828115dcb4a) ([#7211](https://github.com/yt-dlp/yt-dlp/issues/7211)) by [rexlambert22](https://github.com/rexlambert22), [Ti4eeT4e](https://github.com/Ti4eeT4e) +- **nebula**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3f756c8c4095b942cf49788eb0862ceaf57847f2) ([#7156](https://github.com/yt-dlp/yt-dlp/issues/7156)) by [Lamieur](https://github.com/Lamieur), [rohieb](https://github.com/rohieb) +- **rheinmaintv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/98cb1eda7a4cf67c96078980dbd63e6c06ad7f7c) ([#7311](https://github.com/yt-dlp/yt-dlp/issues/7311)) by [barthelmannk](https://github.com/barthelmannk) +- **youtube** + - [Add 
`ios` to default clients used](https://github.com/yt-dlp/yt-dlp/commit/1e75d97db21152acc764b30a688e516f04b8a142)
+        - iOS is affected by neither 403 nor nsig, so it helps mitigate them preemptively
+        - iOS also has higher bit-rate 'premium' formats, though they are not labeled as such
+    - [Improve description parsing performance](https://github.com/yt-dlp/yt-dlp/commit/71dc18fa29263a1ff0472c23d81bfc8dd4422d48) ([#7315](https://github.com/yt-dlp/yt-dlp/issues/7315)) by [berkanteber](https://github.com/berkanteber), [pukkandan](https://github.com/pukkandan)
+    - [Improve nsig function name extraction](https://github.com/yt-dlp/yt-dlp/commit/cd810afe2ac5567c822b7424800fc470ef2d0045) by [pukkandan](https://github.com/pukkandan)
+    - [Workaround 403 for android formats](https://github.com/yt-dlp/yt-dlp/commit/81ca451480051d7ce1a31c017e005358345a9149) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- [Revert "Add automatic duplicate issue detection"](https://github.com/yt-dlp/yt-dlp/commit/a4486bfc1dc7057efca9dd3fe70d7fa25c56f700)
+- **cleanup**
+    - Miscellaneous
+        - [7f9c6a6](https://github.com/yt-dlp/yt-dlp/commit/7f9c6a63b16e145495479e9f666f5b9e2ee69e2f) by [bashonly](https://github.com/bashonly)
+        - [812cdfa](https://github.com/yt-dlp/yt-dlp/commit/812cdfa06c33a40e73a8e04b3e6f42c084666a43) by [pukkandan](https://github.com/pukkandan)
+
 ### 2023.06.21

 #### Important changes

diff --git a/supportedsites.md b/supportedsites.md
index 882b272aab..7d99d9e227 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -515,6 +515,7 @@ # Supported sites
  - **GlattvisionTVLive**: [*glattvisiontv*](## "netrc machine")
  - **GlattvisionTVRecordings**: [*glattvisiontv*](## "netrc machine")
  - **Glide**: Glide mobile video messages (glide.me)
+ - **GlobalCyclingNetworkPlus**
  - **GlobalPlayerAudio**
  - **GlobalPlayerAudioEpisode**
  - **GlobalPlayerLive**
@@ -814,6 +815,7 @@ # Supported sites
  - **MonsterSirenHypergryphMusic**
  - **Morningstar**: morningstar.com
  - **Motherless**
+ - **MotherlessGallery**
  - **MotherlessGroup**
  - **Motorsport**: motorsport.com
  - **MotorTrend**
@@ -1198,6 +1200,7 @@ # Supported sites
  - **Restudy**
  - **Reuters**
  - **ReverbNation**
+ - **RheinMainTV**
  - **RICE**
  - **RMCDecouverte**
  - **RockstarGames**
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index f4474db9a5..434f36f48f 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py

-__version__ = '2023.06.21'
+__version__ = '2023.06.22'

-RELEASE_GIT_HEAD = '42f2d40b475db66486a4b4fe5b56751a640db5db'
+RELEASE_GIT_HEAD = '812cdfa06c33a40e73a8e04b3e6f42c084666a43'

 VARIANT = None

From 89bed013741a776506f60380b7fd89d27d0710b4 Mon Sep 17 00:00:00 2001
From: Simon <35427372+bbilly1@users.noreply.github.com>
Date: Fri, 23 Jun 2023 01:08:42 +0700
Subject: [PATCH 229/501] [extractor/youtube] Fix comments' `is_favorited`
 (#7390)

Authored by: bbilly1
Closes #7389
---
 yt_dlp/extractor/youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index f1c8df1410..a0d0a601ae 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3356,7 +3356,7 @@ def _extract_comment(self, comment_renderer, parent=None):
         info['author_is_uploader'] = author_is_uploader

         comment_abr = traverse_obj(
-            comment_renderer, ('actionsButtons', 'commentActionButtonsRenderer'), expected_type=dict)
+            comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict)
         if comment_abr is not None:
             info['is_favorited'] = 'creatorHeart' in comment_abr

From e59e20744eb32ce4b6ea0dece7c673be8376a710 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Thu, 22 Jun 2023 23:22:14 +0530
Subject: [PATCH 230/501] Bugfix for b4e0d75848e9447cee2cd3646ce54d4744a7ff56

---
 yt_dlp/utils/_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 10052009fc..de51f62083 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -3758,8 +3758,6 @@ def __init__(self, chapters, ranges, from_info=False):
         self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

     def __call__(self, info_dict, ydl):
-        if not any((self.ranges, self.chapters, self.from_info)):
-            yield {}
         warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                    else 'Cannot match chapters since chapter information is unavailable')
@@ -3779,9 +3777,11 @@ def __call__(self, info_dict, ydl):

         if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
             yield {
-                'start_time': info_dict.get('start_time'),
-                'end_time': info_dict.get('end_time'),
+                'start_time': info_dict.get('start_time') or 0,
+                'end_time': info_dict.get('end_time') or float('inf'),
             }
+        elif not self.ranges and not self.chapters:
+            yield {}

     @staticmethod
     def _handle_negative_timestamp(time, info):

From 58786a10f212bd63f9ad1d0b4d9e4d31c3b385e2 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Sun, 25 Jun 2023 20:10:00 +0530
Subject: [PATCH 231/501] [extractor/youtube] Add extractor-arg `formats`

Closes #7417
---
 README.md                   |  3 +--
 yt_dlp/extractor/youtube.py | 22 ++++++++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 4de4ece969..d89bb204e8 100644
--- a/README.md
+++ b/README.md
@@ -1805,8 +1805,7 @@ #### youtube
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
 * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
     * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
-* `include_duplicate_formats`: Extract formats with identical content but different URLs or protocol. This is useful if some of the formats are unavailable or throttled.
-* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8)
+* `formats`: Change the types of formats to return. `dashy` (convert http to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8)
 * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
 * `innertube_key`: Innertube API key to use for all API requests
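As a usage sketch of the new `formats` argument (the video URL below is only a placeholder; multiple values can be comma-separated, as with other extractor-args):

    yt-dlp --extractor-args "youtube:formats=incomplete" "https://www.youtube.com/watch?v=example"
    yt-dlp --extractor-args "youtube:formats=duplicate,incomplete" "https://www.youtube.com/watch?v=example"

The deprecated `include_duplicate_formats` and `include_incomplete_formats` arguments keep working, but as the diff below shows, they now emit a deprecation notice pointing to the equivalent `formats=` value.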
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index a0d0a601ae..bdc631ccb8 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3752,7 +3752,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l
             'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
         ])
         streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
-        all_formats = self._configuration_arg('include_duplicate_formats')
+        format_types = self._configuration_arg('formats')
+        all_formats = 'duplicate' in format_types
+        if self._configuration_arg('include_duplicate_formats'):
+            all_formats = True
+            self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. '
+                                                'Use formats=duplicate extractor argument instead')

         def build_fragments(f):
             return LazyList({
@@ -3892,18 +3897,23 @@ def build_fragments(f):
                 if single_stream and dct.get('ext'):
                     dct['container'] = dct['ext'] + '_dash'

-            if all_formats and dct['filesize']:
+            if (all_formats or 'dashy' in format_types) and dct['filesize']:
                 yield {
                     **dct,
                     'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'],
                     'protocol': 'http_dash_segments',
                     'fragments': build_fragments(dct),
                 }
-            dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
-            yield dct
+            if all_formats or 'dashy' not in format_types:
+                dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
+                yield dct

         needs_live_processing = self._needs_live_processing(live_status, duration)
-        skip_bad_formats = not self._configuration_arg('include_incomplete_formats')
+        skip_bad_formats = 'incomplete' not in format_types
+        if self._configuration_arg('include_incomplete_formats'):
+            skip_bad_formats = False
+            self._downloader.deprecated_feature('[youtube] include_incomplete_formats extractor argument is deprecated. '
+                                                'Use formats=incomplete extractor argument instead')

         skip_manifests = set(self._configuration_arg('skip'))
         if (not self.get_param('youtube_include_hls_manifest', True)
@@ -3915,7 +3925,7 @@ def build_fragments(f):
             skip_manifests.add('dash')
         if self._configuration_arg('include_live_dash'):
             self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. 
' - 'Use include_incomplete_formats extractor argument instead') + 'Use formats=incomplete extractor argument instead') elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') From f0a1ff118145b6449982ba401f9a9f656ecd8062 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 25 Jun 2023 13:13:28 -0500 Subject: [PATCH 232/501] [extractor/qdance] Add extractor (#7420) Closes #7385 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/qdance.py | 150 ++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 yt_dlp/extractor/qdance.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 49a3f39d37..06340fcd8d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1531,6 +1531,7 @@ ) from .puls4 import Puls4IE from .pyvideo import PyvideoIE +from .qdance import QDanceIE from .qingting import QingTingIE from .qqmusic import ( QQMusicIE, diff --git a/yt_dlp/extractor/qdance.py b/yt_dlp/extractor/qdance.py new file mode 100644 index 0000000000..d817677f0e --- /dev/null +++ b/yt_dlp/extractor/qdance.py @@ -0,0 +1,150 @@ +import json +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + jwt_decode_hs256, + str_or_none, + traverse_obj, + try_call, + url_or_none, +) + + +class QDanceIE(InfoExtractor): + _NETRC_MACHINE = 'qdance' + _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P<id>\d+)' + _TESTS = [{ + 'note': 'vod', + 'url': 'https://www.q-dance.com/network/library/146542138', + 'info_dict': { + 'id': '146542138', + 'ext': 'mp4', + 'title': 'Sound Rush [LIVE] | Defqon.1 Weekend Festival 2022 | Friday | RED', + 'display_id': 'sound-rush-live-v3-defqon-1-weekend-festival-2022-friday-red', + 'description': 'Relive Defqon.1 - Primal Energy 2022 with the sounds of Sound Rush LIVE at the RED on Friday! 
🔥', + 'season': 'Defqon.1 Weekend Festival 2022', + 'season_id': '31840632', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1674829540-20220624171509-220624171509_delio_dn201093-2.jpg', + 'availability': 'premium_only', + 'duration': 1829, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'livestream', + 'url': 'https://www.q-dance.com/network/live/149170353', + 'info_dict': { + 'id': '149170353', + 'ext': 'mp4', + 'title': r're:^Defqon\.1 2023 - Friday - RED', + 'display_id': 'defqon-1-2023-friday-red', + 'description': 'md5:3c73fbbd4044e578e696adfc64019163', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1686849069-area-thumbs_red.png', + 'availability': 'subscriber_only', + 'live_status': 'is_live', + 'channel_id': 'qdancenetwork.video_149170353', + }, + 'skip': 'Completed livestream', + }] + + _access_token = None + _refresh_token = None + + def _call_login_api(self, data, note='Logging in'): + login = self._download_json( + 'https://members.id-t.com/api/auth/login', None, note, headers={ + 'content-type': 'application/json', + 'brand': 'qdance', + 'origin': 'https://www.q-dance.com', + 'referer': 'https://www.q-dance.com/', + }, data=json.dumps(data, separators=(',', ':')).encode(), + expected_status=lambda x: True) + + tokens = traverse_obj(login, ('data', { + '_id-t-accounts-token': ('accessToken', {str}), + '_id-t-accounts-refresh': ('refreshToken', {str}), + '_id-t-accounts-id-token': ('idToken', {str}), + })) + + if not tokens.get('_id-t-accounts-token'): + error = ': '.join(traverse_obj(login, ('error', ('code', 'message'), {str}))) + if 'validation_error' not in error: + raise ExtractorError(f'Q-Dance API said "{error}"') + msg = 'Invalid username or password' if 'email' in data else 'Refresh token has expired' + raise ExtractorError(msg, expected=True) + + for name, value in tokens.items(): + self._set_cookie('.q-dance.com', name, value) + + def _perform_login(self, username, password): + self._call_login_api({'email': username, 'password': password}) + + def _real_initialize(self): + cookies = self._get_cookies('https://www.q-dance.com/') + self._refresh_token = try_call(lambda: cookies['_id-t-accounts-refresh'].value) + self._access_token = try_call(lambda: cookies['_id-t-accounts-token'].value) + if not self._access_token: + self.raise_login_required() + + def _get_auth(self): + if (try_call(lambda: jwt_decode_hs256(self._access_token)['exp']) or 0) <= int(time.time() - 120): + if not self._refresh_token: + raise ExtractorError( + 'Cannot refresh access token, login with yt-dlp or refresh cookies in browser') + self._call_login_api({'refreshToken': self._refresh_token}, note='Refreshing access token') + self._real_initialize() + + return {'Authorization': self._access_token} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nuxt_data(webpage, video_id, traverse=('data', 0, 'data')) + + def extract_availability(level): + level = int_or_none(level) or 0 + return self._availability( + needs_premium=(level >= 20), needs_subscription=(level >= 15), needs_auth=True) + + info = traverse_obj(data, { + 'title': ('title', {str.strip}), + 'description': ('description', {str.strip}), + 'display_id': ('slug', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'duration': ('durationInSeconds', {int_or_none}, 
{lambda x: x or None}), + 'availability': ('subscription', 'level', {extract_availability}), + 'is_live': ('type', {lambda x: x.lower() == 'live'}), + 'artist': ('acts', ..., {str}), + 'series': ('event', 'title', {str.strip}), + 'series_id': ('event', 'id', {str_or_none}), + 'season': ('eventEdition', 'title', {str.strip}), + 'season_id': ('eventEdition', 'id', {str_or_none}), + 'channel_id': ('pubnub', 'channelName', {str}), + }) + + stream = self._download_json( + f'https://dc9h6qmsoymbq.cloudfront.net/api/content/videos/{video_id}/url', + video_id, headers=self._get_auth(), expected_status=401) + + m3u8_url = traverse_obj(stream, ('data', 'url', {url_or_none})) + if not m3u8_url and traverse_obj(stream, ('error', 'code')) == 'unauthorized': + raise ExtractorError('Your account does not have access to this content', expected=True) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, fatal=False, live=True) if m3u8_url else [] + if not formats: + self.raise_no_formats('No active streams found', expected=bool(info.get('is_live'))) + + return { + **info, + 'id': video_id, + 'formats': formats, + } From 5e16cf92eb496b7c1541a6b1d727cb87542984db Mon Sep 17 00:00:00 2001 From: nnoboa <90611593+nnoboa@users.noreply.github.com> Date: Sun, 25 Jun 2023 16:22:38 -0400 Subject: [PATCH 233/501] [extractor/AdultSwim] Extract subtitles from m3u8 (#7421) Authored by: nnoboa Closes #6191 --- yt_dlp/extractor/adultswim.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/adultswim.py b/yt_dlp/extractor/adultswim.py index bd29eb43e5..daaeddeb6e 100644 --- a/yt_dlp/extractor/adultswim.py +++ b/yt_dlp/extractor/adultswim.py @@ -170,8 +170,10 @@ def _real_extract(self, url): continue ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) if ext == 'm3u8': - info['formats'].extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + info['formats'].extend(fmts) + self._merge_subtitles(subs, target=info['subtitles']) elif ext == 'f4m': continue # info['formats'].extend(self._extract_f4m_formats( From ef8509c300ea50da86aea447eb214d3d6f6db6bb Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sun, 25 Jun 2023 17:04:42 -0500 Subject: [PATCH 234/501] [extractor/kick] Fix `_VALID_URL` Closes #7384 Authored by: bashonly --- yt_dlp/extractor/kick.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 765ffa0c80..be1dfd4b16 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -30,7 +30,7 @@ def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, * class KickIE(KickBaseIE): - _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://kick.com/yuppy', 'info_dict': { From d949c10c45bfc359bdacd52e6a180169b8128958 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Jun 2023 07:25:47 +0530 Subject: [PATCH 235/501] [extractor/youtube] Process `post_live` over 2 hours --- yt_dlp/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index bdc631ccb8..d5607975e5 100644 --- 
a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3737,7 +3737,7 @@ def append_client(*client_names): def _needs_live_processing(self, live_status, duration): if (live_status == 'is_live' and self.get_param('live_from_start') - or live_status == 'post_live' and (duration or 0) > 4 * 3600): + or live_status == 'post_live' and (duration or 0) > 2 * 3600): return live_status def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): @@ -4238,7 +4238,7 @@ def is_bad_format(fmt): for fmt in filter(is_bad_format, formats): fmt['preference'] = (fmt.get('preference') or -1) - 10 - fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ') + fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 2 hours)', delim=' ') if needs_live_processing: self._prepare_live_from_start_formats( From 8a8af356e3bba98a7f7d333aff0777d5d92130c8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Jun 2023 16:13:31 +0530 Subject: [PATCH 236/501] [downloader/aria2c] Add `--no-conf` Closes #7404 --- yt_dlp/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 007689a8c9..f637a100bf 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -271,7 +271,7 @@ def _call_downloader(self, tmpfilename, info_dict): return super()._call_downloader(tmpfilename, info_dict) def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c', + cmd = [self.exe, '-c', '--no-conf', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] if 'fragments' in info_dict: From f393bbe724b1fc6c7f754a5da507e807b2b40ad2 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Jun 2023 16:14:20 +0530 Subject: [PATCH 237/501] [extractor/sbs] Python 3.7 compat Closes #7410 --- yt_dlp/extractor/sbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index ac0b6de202..119106e8ef 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -139,8 +139,8 @@ def _real_extract(self, url): 'release_year': ('releaseYear', {int_or_none}), 'duration': ('duration', ({float_or_none}, {parse_duration})), 'is_live': ('liveStream', {bool}), - 'age_limit': ( - ('classificationID', 'contentRating'), {str.upper}, {self._AUS_TV_PARENTAL_GUIDELINES.get}), + 'age_limit': (('classificationID', 'contentRating'), {str.upper}, { + lambda x: self._AUS_TV_PARENTAL_GUIDELINES.get(x)}), # dict.get is unhashable in py3.7 }, get_all=False), **traverse_obj(media, { 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}), From 91302ed349f34dc26cc1d661bb45a4b71f4417f7 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Jun 2023 16:19:49 +0530 Subject: [PATCH 238/501] [utils] clean_podcast_url: Handle protocol in redirect URL Closes #7430 --- yt_dlp/utils/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index de51f62083..f68cdb9686 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5113,7 +5113,7 @@ def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', def clean_podcast_url(url): - return re.sub(r'''(?x) + url = re.sub(r'''(?x) (?: (?: 
chtbl\.com/track| @@ -5127,6 +5127,7 @@ def clean_podcast_url(url): st\.fm # https://podsights.com/docs/ )/e )/''', '', url) + return re.sub(r'^\w+://(\w+://)', r'\1', url) _HEX_TABLE = '0123456789abcdef' From 5b4b92769afcc398475e481bfa839f1158902fe9 Mon Sep 17 00:00:00 2001 From: Aman Salwan <121633121+AmanSal1@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:58:23 +0530 Subject: [PATCH 239/501] [extractor/crunchyroll:music] Fix `_VALID_URL` (#7439) Closes #7419 Authored by: AmanSal1, rdamas Co-authored-by: Robert Damas <robert.damas@byom.de> --- yt_dlp/extractor/crunchyroll.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index d4a21616ba..910504ed29 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -490,8 +490,21 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?crunchyroll\.com/ (?P<lang>(?:\w{2}(?:-\w{2})?/)?) - watch/(?P<type>concert|musicvideo)/(?P<id>\w{10})''' + watch/(?P<type>concert|musicvideo)/(?P<id>\w+)''' _TESTS = [{ + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV5B02C79', + 'display_id': 'egaono-hana', + 'title': 'Egaono Hana', + 'track': 'Egaono Hana', + 'artist': 'Goose house', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', 'info_dict': { 'ext': 'mp4', @@ -519,11 +532,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): }, 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana', 'only_matching': True, }, { 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, }] _API_ENDPOINT = 'music' From 8f05fbae2a79ce0713077ccc68b354e63216bf20 Mon Sep 17 00:00:00 2001 From: Xiao Han <38774211+meliber@users.noreply.github.com> Date: Tue, 27 Jun 2023 16:16:57 -0500 Subject: [PATCH 240/501] [extractor/abc] Fix extraction (#7434) Closes #6433 Authored by: meliber --- yt_dlp/extractor/abc.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 0ca76b85a8..f56133eb3e 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -12,6 +12,7 @@ int_or_none, parse_iso8601, str_or_none, + traverse_obj, try_get, unescapeHTML, update_url_query, @@ -85,6 +86,15 @@ class ABCIE(InfoExtractor): 'uploader': 'Behind the News', 'uploader_id': 'behindthenews', } + }, { + 'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540', + 'info_dict': { + 'id': '102520540', + 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus', + 'ext': 'mp4', + 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.', + 'thumbnail': 
'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485', + } }] def _real_extract(self, url): @@ -107,7 +117,7 @@ def _real_extract(self, url): video = True if mobj is None: - mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage) + mobj = re.search(r'(?P<type>)"(?:sources|files|renditions)":\s*(?P<json_data>\[[^\]]+\])', webpage) if mobj is None: mobj = re.search( r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', @@ -121,7 +131,8 @@ def _real_extract(self, url): urls_info = self._parse_json( mobj.group('json_data'), video_id, transform_source=js_to_json) youtube = mobj.group('type') == 'YouTube' - video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4' + video = mobj.group('type') == 'Video' or traverse_obj( + urls_info, (0, ('contentType', 'MIMEType')), get_all=False) == 'video/mp4' if not isinstance(urls_info, list): urls_info = [urls_info] From a2be9781fbf4d7e4db245c277ca2ecc41cf3a7b2 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Tue, 27 Jun 2023 16:50:02 -0500 Subject: [PATCH 241/501] [extractor/Douyin] Fix extraction from webpage Closes #7431 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9c6d74007d..2f491c3170 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1015,18 +1015,16 @@ def _real_extract(self, url): self.to_screen(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) - render_data_json = self._search_regex( - r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>', - webpage, 'render data', default=None) - if not render_data_json: + render_data = self._search_json( + r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id, + contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote) + if not render_data: # TODO: Run verification challenge code to generate signature cookies cookies = self._get_cookies(self._WEBPAGE_HOST) expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid') raise ExtractorError( 'Fresh cookies (not necessarily logged in) are needed', expected=expected) - render_data = self._parse_json( - render_data_json, video_id, transform_source=compat_urllib_parse_unquote) return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id) From fcbc9ed760be6e3455bbadfaf277b4504b06f068 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Thu, 29 Jun 2023 23:26:27 +0000 Subject: [PATCH 242/501] [extractor/youtube:tab] Support shorts-only playlists (#7425) Fixes https://github.com/yt-dlp/yt-dlp/issues/7424 Authored by: coletdjnz Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> --- yt_dlp/extractor/youtube.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d5607975e5..967914c0f7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4898,7 +4898,8 @@ def _extract_entries(self, parent_renderer, continuation_list): 'videoRenderer': lambda x: [self._video_entry(x)], 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), 'channelRenderer': lambda x: self._grid_entries({'items': 
[{'channelRenderer': x}]}), - 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)] + 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)], + 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list), } for key, renderer in isr_content.items(): if key not in known_renderers: @@ -6390,6 +6391,28 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_is_verified': True, }, 'playlist_mincount': 10, + }, { + # Playlist with only shorts, shown as reel renderers + # FIXME: future: YouTube currently doesn't give continuation for this, + # may do in future. + 'url': 'https://www.youtube.com/playlist?list=UUxqPAgubo4coVn9Lx1FuKcg', + 'info_dict': { + 'id': 'UUxqPAgubo4coVn9Lx1FuKcg', + 'channel_url': 'https://www.youtube.com/channel/UCxqPAgubo4coVn9Lx1FuKcg', + 'view_count': int, + 'uploader_id': '@BangyShorts', + 'description': '', + 'uploader_url': 'https://www.youtube.com/@BangyShorts', + 'channel_id': 'UCxqPAgubo4coVn9Lx1FuKcg', + 'channel': 'Bangy Shorts', + 'uploader': 'Bangy Shorts', + 'tags': [], + 'availability': 'public', + 'modified_date': '20230626', + 'title': 'Uploads from Bangy Shorts', + }, + 'playlist_mincount': 100, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }] @classmethod From af1fd12f675220df6793fc019dff320bc76e8080 Mon Sep 17 00:00:00 2001 From: urectanc <5403400+urectanc@users.noreply.github.com> Date: Sat, 1 Jul 2023 03:27:07 +0900 Subject: [PATCH 243/501] [extractor/stacommu] Add extractors (#7432) Authored by: urectanc --- README.md | 2 +- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/stacommu.py | 148 ++++++++++++++++++++++++++++ yt_dlp/extractor/wrestleuniverse.py | 33 ++++--- 4 files changed, 173 insertions(+), 14 deletions(-) create mode 100644 yt_dlp/extractor/stacommu.py diff --git a/README.md b/README.md index d89bb204e8..066ff90528 100644 --- a/README.md +++ b/README.md @@ -1855,7 +1855,7 @@ #### rokfinchannel #### twitter * `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed -#### wrestleuniverse +#### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. 
Can be found in browser local storage #### twitch diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 06340fcd8d..76a7fef23e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1855,6 +1855,10 @@ SRGSSRPlayIE, ) from .srmediathek import SRMediathekIE +from .stacommu import ( + StacommuLiveIE, + StacommuVODIE, +) from .stanfordoc import StanfordOpenClassroomIE from .startv import StarTVIE from .steam import ( diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py new file mode 100644 index 0000000000..6f58f06dc8 --- /dev/null +++ b/yt_dlp/extractor/stacommu.py @@ -0,0 +1,148 @@ +import time + +from .wrestleuniverse import WrestleUniverseBaseIE +from ..utils import ( + int_or_none, + traverse_obj, + url_or_none, +) + + +class StacommuBaseIE(WrestleUniverseBaseIE): + _NETRC_MACHINE = 'stacommu' + _API_HOST = 'api.stacommu.jp' + _LOGIN_QUERY = {'key': 'AIzaSyCR9czxhH2eWuijEhTNWBZ5MCcOYEUTAhg'} + _LOGIN_HEADERS = { + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web', + 'Referer': 'https://www.stacommu.jp/', + 'Origin': 'https://www.stacommu.jp', + } + + @WrestleUniverseBaseIE._TOKEN.getter + def _TOKEN(self): + if self._REAL_TOKEN and self._TOKEN_EXPIRY <= int(time.time()): + self._refresh_token() + + return self._REAL_TOKEN + + def _get_formats(self, data, path, video_id=None): + if not traverse_obj(data, path) and not data.get('canWatch') and not self._TOKEN: + self.raise_login_required(method='password') + return super()._get_formats(data, path, video_id) + + def _extract_hls_key(self, data, path, decrypt): + encryption_data = traverse_obj(data, path) + if traverse_obj(encryption_data, ('encryptType', {int})) == 0: + return None + return traverse_obj(encryption_data, {'key': ('key', {decrypt}), 'iv': ('iv', {decrypt})}) + + +class StacommuVODIE(StacommuBaseIE): + _VALID_URL = r'https?://www\.stacommu\.jp/videos/episodes/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + # not encrypted + 'url': 'https://www.stacommu.jp/videos/episodes/aXcVKjHyAENEjard61soZZ', + 'info_dict': { + 'id': 'aXcVKjHyAENEjard61soZZ', + 'ext': 'mp4', + 'title': 'スタコミュAWARDの裏側、ほぼ全部見せます!〜晴れ舞台の直前ドキドキ編〜', + 'description': 'md5:6400275c57ae75c06da36b06f96beb1c', + 'timestamp': 1679652000, + 'upload_date': '20230324', + 'thumbnail': 'https://image.stacommu.jp/6eLobQan8PFtBoU4RL4uGg/6eLobQan8PFtBoU4RL4uGg', + 'cast': 'count:11', + 'duration': 250, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # encrypted; requires a premium account + 'url': 'https://www.stacommu.jp/videos/episodes/3hybMByUvzMEqndSeu5LpD', + 'info_dict': { + 'id': '3hybMByUvzMEqndSeu5LpD', + 'ext': 'mp4', + 'title': 'スタプラフェス2023〜裏側ほぼ全部見せます〜#10', + 'description': 'md5:85494488ccf1dfa1934accdeadd7b340', + 'timestamp': 1682506800, + 'upload_date': '20230426', + 'thumbnail': 'https://image.stacommu.jp/eMdXtEefR4kEyJJMpAFi7x/eMdXtEefR4kEyJJMpAFi7x', + 'cast': 'count:55', + 'duration': 312, + 'hls_aes': { + 'key': '6bbaf241b8e1fd9f59ecf546a70e4ae7', + 'iv': '1fc9002a23166c3bb1d240b953d09de9', + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'videoEpisodes' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_metadata( + url, video_id, 'ja', ('dehydratedState', 'queries', 0, 'state', 'data')) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watch', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': 
self._get_formats(hls_info, ('protocolHls', 'url', {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'protocolHls', decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'timestamp': ('watchStartTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'cast': ('casts', ..., 'displayName', {str}), + 'duration': ('duration', {int}), + }), + } + + +class StacommuLiveIE(StacommuBaseIE): + _VALID_URL = r'https?://www\.stacommu\.jp/live/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://www.stacommu.jp/live/d2FJ3zLnndegZJCAEzGM3m', + 'info_dict': { + 'id': 'd2FJ3zLnndegZJCAEzGM3m', + 'ext': 'mp4', + 'title': '仲村悠菜 2023/05/04', + 'timestamp': 1683195647, + 'upload_date': '20230504', + 'thumbnail': 'https://image.stacommu.jp/pHGF57SPEHE2ke83FS92FN/pHGF57SPEHE2ke83FS92FN', + 'duration': 5322, + 'hls_aes': { + 'key': 'efbb3ec0b8246f61adf1764c5a51213a', + 'iv': '80621d19a1f19167b64cedb415b05d1c', + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'events' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._call_api(video_id, msg='video information', query={'al': 'ja'}, auth=False) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watchArchive', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'timestamp': ('startTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'duration': ('duration', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index b12b0f0a9e..99a8f01200 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -14,12 +14,14 @@ try_call, url_or_none, urlencode_postdata, + variadic, ) class WrestleUniverseBaseIE(InfoExtractor): _NETRC_MACHINE = 'wrestleuniverse' _VALID_URL_TMPL = r'https?://(?:www\.)?wrestle-universe\.com/(?:(?P<lang>\w{2})/)?%s/(?P<id>\w+)' + _API_HOST = 'api.wrestle-universe.com' _API_PATH = None _REAL_TOKEN = None _TOKEN_EXPIRY = None @@ -67,24 +69,28 @@ def _perform_login(self, username, password): 'returnSecureToken': True, 'email': username, 'password': password, - }, separators=(',', ':')).encode()) + }, separators=(',', ':')).encode(), expected_status=400) + token = traverse_obj(login, ('idToken', {str})) + if not token: + raise ExtractorError( + f'Unable to log in: {traverse_obj(login, ("error", "message"))}', expected=True) self._REFRESH_TOKEN = traverse_obj(login, ('refreshToken', {str})) if not self._REFRESH_TOKEN: self.report_warning('No refresh token was granted') - self._TOKEN = traverse_obj(login, ('idToken', {str})) + self._TOKEN = token def _real_initialize(self): - if WrestleUniverseBaseIE._DEVICE_ID: + if self._DEVICE_ID: return - WrestleUniverseBaseIE._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key='WrestleUniverse')[0] - if not WrestleUniverseBaseIE._DEVICE_ID: - WrestleUniverseBaseIE._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id') - if WrestleUniverseBaseIE._DEVICE_ID: + self._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key=self._NETRC_MACHINE)[0] + if not self._DEVICE_ID: + self._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id') + if self._DEVICE_ID: return - 
WrestleUniverseBaseIE._DEVICE_ID = str(uuid.uuid4()) + self._DEVICE_ID = str(uuid.uuid4()) - self.cache.store(self._NETRC_MACHINE, 'device_id', WrestleUniverseBaseIE._DEVICE_ID) + self.cache.store(self._NETRC_MACHINE, 'device_id', self._DEVICE_ID) def _refresh_token(self): refresh = self._download_json( @@ -108,10 +114,10 @@ def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={ if data: headers['Content-Type'] = 'application/json;charset=utf-8' data = json.dumps(data, separators=(',', ':')).encode() - if auth: + if auth and self._TOKEN: headers['Authorization'] = f'Bearer {self._TOKEN}' return self._download_json( - f'https://api.wrestle-universe.com/v1/{self._API_PATH}/{video_id}{param}', video_id, + f'https://{self._API_HOST}/v1/{self._API_PATH}/{video_id}{param}', video_id, note=f'Downloading {msg} JSON', errnote=f'Failed to download {msg} JSON', data=data, headers=headers, query=query, fatal=fatal) @@ -137,12 +143,13 @@ def decrypt(data): }, query=query, fatal=fatal) return api_json, decrypt - def _download_metadata(self, url, video_id, lang, props_key): + def _download_metadata(self, url, video_id, lang, props_keys): metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False) if not metadata: webpage = self._download_webpage(url, video_id) nextjs_data = self._search_nextjs_data(webpage, video_id) - metadata = traverse_obj(nextjs_data, ('props', 'pageProps', props_key, {dict})) or {} + metadata = traverse_obj(nextjs_data, ( + 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {} return metadata def _get_formats(self, data, path, video_id=None): From 8776349ef6b1f644584a92dfa00a05208a48edc4 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sun, 2 Jul 2023 15:31:00 -0400 Subject: [PATCH 244/501] [extractor/vk] VKPlay, VKPlayLive: Add extractors (#7358) Closes #7107 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/vk.py | 139 ++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 76a7fef23e..6f1873383a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2272,6 +2272,8 @@ VKIE, VKUserVideosIE, VKWallPostIE, + VKPlayIE, + VKPlayLiveIE, ) from .vocaroo import VocarooIE from .vodlocker import VodlockerIE diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 16ca954f25..5753690283 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -11,11 +11,13 @@ from .youtube import YoutubeIE from ..utils import ( ExtractorError, + UserNotLive, clean_html, get_element_by_class, get_element_html_by_id, int_or_none, join_nonempty, + parse_resolution, str_or_none, str_to_int, try_call, @@ -25,6 +27,7 @@ url_or_none, urlencode_postdata, urljoin, + traverse_obj, ) @@ -701,3 +704,139 @@ def _real_extract(self, url): return self.playlist_result( entries, post_id, join_nonempty(uploader, f'Wall post {post_id}', delim=' - '), clean_html(get_element_by_class('wall_post_text', webpage))) + + +class VKPlayBaseIE(InfoExtractor): + _RESOLUTIONS = { + 'tiny': '256x144', + 'lowest': '426x240', + 'low': '640x360', + 'medium': '852x480', + 'high': '1280x720', + 'full_hd': '1920x1080', + 'quad_hd': '2560x1440', + } + + def _extract_from_initial_state(self, url, video_id, path): + webpage = self._download_webpage(url, video_id) + video_info = traverse_obj(self._search_json( + 
r'<script[^>]+\bid="initial-state"[^>]*>', webpage, 'initial state', video_id), + path, expected_type=dict) + if not video_info: + raise ExtractorError('Unable to extract video info from html inline initial state') + return video_info + + def _extract_formats(self, stream_info, video_id): + formats = [] + for stream in traverse_obj(stream_info, ( + 'data', 0, 'playerUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + url = stream['url'] + format_id = str_or_none(stream['type']) + if format_id in ('hls', 'live_hls', 'live_playback_hls') or '.m3u8' in url: + formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id=format_id, fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats(url, video_id, mpd_id=format_id, fatal=False)) + elif format_id in ('live_dash', 'live_playback_dash'): + self.write_debug(f'Not extracting unsupported format "{format_id}"') + else: + formats.append({ + 'url': url, + 'ext': 'mp4', + 'format_id': format_id, + **parse_resolution(self._RESOLUTIONS.get(format_id)), + }) + return formats + + def _extract_common_meta(self, stream_info): + return traverse_obj(stream_info, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'release_timestamp': ('startTime', {int_or_none}), + 'thumbnail': ('previewUrl', {url_or_none}), + 'view_count': ('count', 'views', {int_or_none}), + 'like_count': ('count', 'likes', {int_or_none}), + 'categories': ('category', 'title', {str}, {lambda x: [x] if x else None}), + 'uploader': (('user', ('blog', 'owner')), 'nick', {str}), + 'uploader_id': (('user', ('blog', 'owner')), 'id', {str_or_none}), + 'duration': ('duration', {int_or_none}), + 'is_live': ('isOnline', {bool}), + 'concurrent_view_count': ('count', 'viewers', {int_or_none}), + }, get_all=False) + + +class VKPlayIE(VKPlayBaseIE): + _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/]+)/record/(?P<id>[a-f0-9\-]+)' + _TESTS = [{ + 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da', + 'info_dict': { + 'id': 'f5e6e3b5-dc52-4d14-965d-0680dd2882da', + 'ext': 'mp4', + 'title': 'Atomic Heart (пробуем!) 
спасибо подписчику EKZO!', + 'uploader': 'ZitsmanN', + 'uploader_id': '13159830', + 'release_timestamp': 1683461378, + 'release_date': '20230507', + 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+', + 'duration': 10608, + 'view_count': int, + 'like_count': int, + 'categories': ['Atomic Heart'], + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + username, video_id = self._match_valid_url(url).groups() + + record_info = traverse_obj(self._download_json( + f'https://api.vkplay.live/v1/blog/{username}/public_video_stream/record/{video_id}', video_id, fatal=False), + ('data', 'record', {dict})) + if not record_info: + record_info = self._extract_from_initial_state(url, video_id, ('record', 'currentRecord', 'data')) + + return { + **self._extract_common_meta(record_info), + 'id': video_id, + 'formats': self._extract_formats(record_info, video_id), + } + + +class VKPlayLiveIE(VKPlayBaseIE): + _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/]+)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://vkplay.live/bayda', + 'info_dict': { + 'id': 'f02c321e-427b-408d-b12f-ae34e53e0ea2', + 'ext': 'mp4', + 'title': r're:эскапизм крута .*', + 'uploader': 'Bayda', + 'uploader_id': 12279401, + 'release_timestamp': 1687209962, + 'release_date': '20230619', + 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+', + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'categories': ['EVE Online'], + 'live_status': 'is_live', + }, + 'skip': 'livestream', + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + username = self._match_id(url) + + stream_info = self._download_json( + f'https://api.vkplay.live/v1/blog/{username}/public_video_stream', username, fatal=False) + if not stream_info: + stream_info = self._extract_from_initial_state(url, username, ('stream', 'stream', 'data', 'stream')) + + formats = self._extract_formats(stream_info, username) + if not formats and not traverse_obj(stream_info, ('isOnline', {bool})): + raise UserNotLive(video_id=username) + + return { + **self._extract_common_meta(stream_info), + 'formats': formats, + } From 4dc4d8473c085900edc841c87c20041233d25b1f Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Mon, 3 Jul 2023 10:47:10 +0000 Subject: [PATCH 245/501] [extractor/youtube] Ignore incomplete data for comment threads by default (#7475) For both `--ignore-errors` and `--ignore-errors only_download`. Pass `--no-ignore-errors` to not ignore. Closes https://github.com/yt-dlp/yt-dlp/issues/7474 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 967914c0f7..2c64f8e845 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3426,7 +3426,9 @@ def extract_thread(contents): # Pinned comments may appear a second time in newest first sort # See: https://github.com/yt-dlp/yt-dlp/issues/6712 continue - self.report_warning('Detected YouTube comments looping. Stopping comment extraction as we probably cannot get any more.') + self.report_warning( + 'Detected YouTube comments looping. 
Stopping comment extraction ' + f'{"for this thread" if parent else ""} as we probably cannot get any more.') yield else: tracker['seen_comment_ids'].add(comment['id']) @@ -3517,12 +3519,18 @@ def extract_thread(contents): # Ignore incomplete data error for replies if retries didn't work. # This is to allow any other parent comments and comment threads to be downloaded. # See: https://github.com/yt-dlp/yt-dlp/issues/4669 - if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True: - self.report_warning( - 'Received incomplete data for a comment reply thread and retrying did not help. ' - 'Ignoring to let other comments be downloaded.') - else: - raise + if 'incomplete data' in str(e).lower() and parent: + if self.get_param('ignoreerrors') in (True, 'only_download'): + self.report_warning( + 'Received incomplete data for a comment reply thread and retrying did not help. ' + 'Ignoring to let other comments be downloaded. Pass --no-ignore-errors to not ignore.') + return + else: + raise ExtractorError( + 'Incomplete data received for comment reply thread. ' + 'Pass --ignore-errors to ignore and allow rest of comments to download.', + expected=True) + raise is_forced_continuation = False continuation = None for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): From 3b7f5300c577fef40464d46d4e4037a69d51fe82 Mon Sep 17 00:00:00 2001 From: RfadnjdExt <40250666+RfadnjdExt@users.noreply.github.com> Date: Wed, 5 Jul 2023 09:17:13 +0700 Subject: [PATCH 246/501] [extractor/googledrive] Fix source format extraction (#7395) Closes #7344 Authored by: RfadnjdExt --- yt_dlp/extractor/googledrive.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 9e2ccde005..8a4cd1690e 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -5,7 +5,9 @@ from ..utils import ( ExtractorError, determine_ext, + extract_attributes, get_element_by_class, + get_element_html_by_id, int_or_none, lowercase_escape, try_get, @@ -34,6 +36,7 @@ class GoogleDriveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, + 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', } }, { # video can't be watched anonymously due to view count limit reached, @@ -207,10 +210,10 @@ def get_value(key): 'export': 'download', }) - def request_source_file(source_url, kind): + def request_source_file(source_url, kind, data=None): return self._request_webpage( source_url, video_id, note='Requesting %s file' % kind, - errnote='Unable to request %s file' % kind, fatal=False) + errnote='Unable to request %s file' % kind, fatal=False, data=data) urlh = request_source_file(source_url, 'source') if urlh: def add_source_format(urlh): @@ -237,14 +240,10 @@ def add_source_format(urlh): urlh, url, video_id, note='Downloading confirmation page', errnote='Unable to confirm download', fatal=False) if confirmation_webpage: - confirm = self._search_regex( - r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', default=None) - if confirm: - confirmed_source_url = update_url_query(source_url, { - 'confirm': confirm, - }) - urlh = request_source_file(confirmed_source_url, 'confirmed source') + confirmed_source_url = extract_attributes( + get_element_html_by_id('download-form', confirmation_webpage) or '').get('action') + if confirmed_source_url: + urlh = 
request_source_file(confirmed_source_url, 'confirmed source', data=b'') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) else: From 1cffd621cb371f1563563cfb2fe37d137e8a7bee Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 4 Jul 2023 22:05:52 -0500 Subject: [PATCH 247/501] [extractor/twitter:spaces] Fix extraction (#7512) Closes #7455 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 38 +++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index f854d9c4a4..1fb9524da6 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -889,8 +889,10 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', 'release_timestamp': 1658417414, - 'description': 'md5:acce559345fd49f129c20dbcda3f1201', - 'timestamp': 1658407771464, + 'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad', + 'timestamp': 1658407771, + 'release_date': '20220721', + 'upload_date': '20220721', }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, @@ -1436,7 +1438,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': r're:Lucio Di Gaetano.*?', 'uploader_id': 'luciodigaetano', 'live_status': 'was_live', - 'timestamp': 1659877956397, + 'timestamp': 1659877956, + 'upload_date': '20220807', + 'release_timestamp': 1659904215, + 'release_date': '20220807', }, 'params': {'skip_download': 'm3u8'}, }] @@ -1482,26 +1487,31 @@ def _real_extract(self, url): metadata = space_data['metadata'] live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()]) + is_live = live_status == 'is_live' formats = [] if live_status == 'is_upcoming': self.raise_no_formats('Twitter Space not started yet', expected=True) - elif live_status == 'post_live': - self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True) - else: - source = self._call_api( - f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key'])['source'] - - # XXX: Native downloader does not work + elif not is_live and not metadata.get('is_space_available_for_replay'): + self.raise_no_formats('Twitter Space ended and replay is disabled', expected=True) + elif metadata.get('media_key'): + source = traverse_obj( + self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']), + ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) formats = self._extract_m3u8_formats( - traverse_obj(source, 'noRedirectPlaybackUrl', 'location'), - metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live', - headers={'Referer': 'https://twitter.com/'}) + source, metadata['media_key'], 'm4a', live=is_live, fatal=False, + headers={'Referer': 'https://twitter.com/'}) if source else [] for fmt in formats: fmt.update({'vcodec': 'none', 'acodec': 'aac'}) + if not is_live: + fmt['container'] = 'm4a_dash' participants = ', '.join(traverse_obj( space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet' + + if not formats and live_status == 'post_live': + self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True) + return { 'id': space_id, 'title': metadata.get('title'), @@ -1513,7 +1523,7 @@ def _real_extract(self, url): 'live_status': live_status, 'release_timestamp': try_call( lambda: int_or_none(metadata['scheduled_start'], scale=1000)), - 'timestamp': metadata.get('created_at'), + 'timestamp': 
int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, } From 49296437a8e5fa91dacb5446e51ab588474c85d3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 5 Jul 2023 11:27:36 -0500 Subject: [PATCH 248/501] [extractor/twitter] Fix unauthenticated extraction (#7476) Closes #7473 Authored by: bashonly --- README.md | 3 - yt_dlp/extractor/twitter.py | 180 ++++++++++++++---------------------- 2 files changed, 70 insertions(+), 113 deletions(-) diff --git a/README.md b/README.md index 066ff90528..4fb3e450d8 100644 --- a/README.md +++ b/README.md @@ -1852,9 +1852,6 @@ #### tiktok #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` -#### twitter -* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed - #### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 1fb9524da6..eaf9be5268 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,5 +1,6 @@ import json import re +import urllib.error from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE @@ -34,7 +35,6 @@ class TwitterBaseIE(InfoExtractor): _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} - _guest_token = None _flow_token = None _LOGIN_INIT_DATA = json.dumps({ @@ -145,14 +145,6 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _fetch_guest_token(self, headers, display_id): - headers.pop('x-guest-token', None) - self._guest_token = traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', display_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if not self._guest_token: - raise ExtractorError('Could not retrieve guest token') - def _set_base_headers(self): headers = self._AUTH.copy() csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) @@ -183,12 +175,15 @@ def _perform_login(self, username, password): if self.is_logged_in: return - self._request_webpage('https://twitter.com/', None, 'Requesting cookies') + webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') headers = self._set_base_headers() - self._fetch_guest_token(headers, None) + guest_token = self._search_regex( + r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._download_json( + f'{self._API_BASE}guest/activate.json', None, 'Downloading guest token', + data=b'', headers=headers)['guest_token'] headers.update({ 'content-type': 'application/json', - 'x-guest-token': self._guest_token, + 'x-guest-token': guest_token, 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', 'Referer': 'https://twitter.com/', @@ -285,37 +280,24 @@ def input_dict(subtask_id, text): self.report_login() def _call_api(self, path, video_id, query={}, graphql=False): - headers = self._set_base_headers() - if self.is_logged_in: - 
headers.update({ + if not self.is_logged_in: + self.raise_login_required() + + result = self._download_json( + (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, video_id, + f'Downloading {"GraphQL" if graphql else "legacy API"} JSON', headers={ + **self._set_base_headers(), 'x-twitter-auth-type': 'OAuth2Session', 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', - }) + }, query=query, expected_status={400, 401, 403, 404} if graphql else {403}) - for first_attempt in (True, False): - if not self.is_logged_in: - if not self._guest_token: - self._fetch_guest_token(headers, video_id) - headers['x-guest-token'] = self._guest_token + if result.get('errors'): + errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) + raise ExtractorError( + f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) - allowed_status = {400, 401, 403, 404} if graphql else {403} - result = self._download_json( - (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, - video_id, headers=headers, query=query, expected_status=allowed_status, - note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') - - if result.get('errors'): - errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) - if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): - self.to_screen('Guest token has expired. Refreshing guest token') - self._guest_token = None - continue - - raise ExtractorError( - f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) - - return result + return result def _build_graphql_query(self, media_id): raise NotImplementedError('Method must be implemented to support GraphQL') @@ -457,6 +439,7 @@ class TwitterIE(TwitterBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?' 
_TESTS = [{ + # comment_count, repost_count, view_count are only available with auth (applies to all tests) 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'info_dict': { 'id': '643211870443208704', @@ -471,10 +454,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1442188653, 'upload_date': '20150913', 'uploader_url': 'https://twitter.com/freethenipple', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 18, }, @@ -505,8 +485,6 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1447395772, 'upload_date': '20151113', 'uploader_url': 'https://twitter.com/starwars', - 'comment_count': int, - 'repost_count': int, 'like_count': int, 'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'age_limit': 0, @@ -550,10 +528,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1455777459, 'upload_date': '20160218', 'uploader_url': 'https://twitter.com/jaydingeer', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, }, @@ -591,10 +566,7 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20160412', 'uploader_url': 'https://twitter.com/CaptainAmerica', 'thumbnail': r're:^https?://.*\.jpg', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -641,10 +613,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1505803395, 'upload_date': '20170919', 'uploader_url': 'https://twitter.com/Prefet971', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, }, @@ -667,10 +636,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1527623489, 'upload_date': '20180529', 'uploader_url': 'https://twitter.com/LisPower1', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -692,10 +658,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1548184644, 'upload_date': '20190122', 'uploader_url': 'https://twitter.com/Twitter', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -713,6 +676,7 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], + 'skip': 'Requires authentication', }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', @@ -729,8 +693,6 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1610651040, 'upload_date': '20210114', 'uploader_url': 'https://twitter.com/BrooklynNets', - 'comment_count': int, - 'repost_count': int, 'like_count': int, 'tags': [], 'age_limit': 0, @@ -753,10 +715,7 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.03, 'timestamp': 1665025050, - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -765,15 +724,13 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima | #\u0432\u029f\u043c - Test', + 'title': 'Ultima📛 | #вʟм - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima | #\u0432\u029f\u043c', + 'uploader': 'Ultima📛 | #вʟм', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', 'timestamp': 1664992565, - 'comment_count': int, - 'repost_count': int, 'like_count': int, 'tags': [], 'age_limit': 0, @@ -795,10 +752,7 @@ class 
TwitterIE(TwitterBaseIE): 'duration': 21.321, 'timestamp': 1664477766, 'upload_date': '20220929', - 'comment_count': int, - 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, }, @@ -825,6 +779,20 @@ class TwitterIE(TwitterBaseIE): }, 'skip': 'Requires authentication', }, { + # Single Vimeo video result without auth + 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', + 'info_dict': { + 'id': '551578322', + 'ext': 'mp4', + 'title': 'Dusty & The Mayor', + 'uploader': 'Michael Chau', + 'uploader_id': 'user29061007', + 'uploader_url': 'https://vimeo.com/user29061007', + 'duration': 478, + 'thumbnail': 'https://i.vimeocdn.com/video/1139658575-0dfdce6e9a2401fe09feb24bf0d14e6f24a53c12f447ff688ace61009ad4c1ba-d_1280', + }, + }, { + # Playlist result only with auth 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -842,6 +810,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/Srirachachau', 'timestamp': 1621447860, }, + 'skip': 'Requires authentication', }, { 'url': 'https://twitter.com/DavidToons_/status/1578353380363501568', 'playlist_mincount': 2, @@ -860,6 +829,7 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20221007', 'age_limit': 0, }, + 'skip': 'Requires authentication', }, { 'url': 'https://twitter.com/primevideouk/status/1578401165338976258', 'playlist_count': 2, @@ -873,8 +843,6 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20221007', 'age_limit': 0, 'uploader_url': 'https://twitter.com/primevideouk', - 'comment_count': int, - 'repost_count': int, 'like_count': int, 'tags': ['TheRingsOfPower'], }, @@ -896,6 +864,7 @@ class TwitterIE(TwitterBaseIE): }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, + 'skip': 'Requires authentication', }, { # URL specifies video number but --yes-playlist 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -905,9 +874,7 @@ class TwitterIE(TwitterBaseIE): 'title': 'md5:be05989b0722e114103ed3851a0ffae2', 'timestamp': 1670459604.0, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', - 'comment_count': int, 'uploader_id': 'CTVJLaidlaw', - 'repost_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'upload_date': '20221208', 'age_limit': 0, @@ -926,14 +893,11 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670459604.0, 'uploader_id': 'CTVJLaidlaw', 'uploader': 'Jocelyn Laidlaw', - 'repost_count': int, - 'comment_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'duration': 102.226, 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, - 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, @@ -959,9 +923,6 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 18, 'tags': [], 'like_count': int, - 'repost_count': int, - 'comment_count': int, - 'view_count': int, }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -974,10 +935,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'uploader_id': 'hlo_again', 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig', - 'repost_count': int, 'duration': 9.531, - 'comment_count': int, - 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, @@ -994,14 +952,11 @@ class TwitterIE(TwitterBaseIE): 'ext': 'mp4', 'uploader_url': 
'https://twitter.com/MunTheShinobi', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', - 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, 'uploader': 'Mün The Shinobi', - 'repost_count': int, 'upload_date': '20221206', 'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', - 'comment_count': int, 'like_count': int, 'tags': [], 'uploader_id': 'MunTheShinobi', @@ -1009,14 +964,14 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670306984.0, }, }, { - # url to retweet id, legacy API + # url to retweet id 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', 'display_id': '1623739803874349067', 'ext': 'mp4', 'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy', - 'description': 'md5:e873616a4a8fe0f93e71872678a672f3', + 'description': 'md5:224d62f54b0cdef8e33d4c56c41ac503', 'uploader': 'Johnny Bullets', 'uploader_id': 'Johnnybull3ts', 'uploader_url': 'https://twitter.com/Johnnybull3ts', @@ -1027,10 +982,7 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20230208', 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', 'like_count': int, - 'repost_count': int, - 'comment_count': int, }, - 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1081,8 +1033,6 @@ def _graphql_to_legacy(self, data, twid): if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') - if cause and 'adult content' in cause: - self.raise_login_required(cause) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) status = result.get('legacy', {}) @@ -1138,19 +1088,22 @@ def _build_graphql_query(self, media_id): def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') - if self._configuration_arg('legacy_api') and not self.is_logged_in: - status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { - 'cards_platform': 'Web-12', - 'include_cards': 1, - 'include_reply_count': 1, - 'include_user_entities': 0, - 'tweet_mode': 'extended', - }), 'retweeted_status', None) + if not self.is_logged_in: + try: + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={'id': twid}) + self.to_screen(f'Some metadata is missing without authentication. 
{self._login_hint()}') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + self.raise_login_required('Requested tweet may only be available when logged in') + raise else: - result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) - status = self._graphql_to_legacy(result, twid) + status = self._graphql_to_legacy( + self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) - title = description = status['full_text'].replace('\n', ' ') + title = description = traverse_obj( + status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) user = status.get('user') or {} @@ -1176,12 +1129,16 @@ def _real_extract(self, url): def extract_from_video_info(media): media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + if not media_id: + # workaround for non-authenticated responses + media_id = traverse_obj(media, ( + 'video_info', 'variants', ..., 'url', + {lambda x: re.search(r'_video/(\d+)/', x)[1]}), get_all=False) self.write_debug(f'Extracting from video info: {media_id}') - video_info = media.get('video_info') or {} formats = [] subtitles = {} - for variant in video_info.get('variants', []): + for variant in traverse_obj(media, ('video_info', 'variants', ...)): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) @@ -1201,12 +1158,12 @@ def add_thumbnail(name, size): add_thumbnail('orig', media.get('original_info') or {}) return { - 'id': media_id, + 'id': media_id or twid, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), - 'duration': float_or_none(video_info.get('duration_millis'), 1000), + 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), } @@ -1286,12 +1243,15 @@ def get_binding_value(k): } videos = traverse_obj(status, ( - (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict})) + ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), + lambda _, m: m['type'] != 'photo', {dict})) if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card'))) else: - desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict})) + desired_obj = traverse_obj(status, ( + ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), + int(selected_index) - 1, {dict}), get_all=False) if not desired_obj: raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) elif desired_obj.get('type') != 'video': From 90db9a3c00ca80492c6a58c542e4cbf4c2710866 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 7 Jul 2023 01:32:41 +1200 Subject: [PATCH 249/501] [extractor/youtube:stories] Remove (#7459) YouTube killed them https://web.archive.org/web/20230630153050/https://support.google.com/youtube/thread/217640760 --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/youtube.py | 43 +-------------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git 
a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6f1873383a..c0a330dbe5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -15,7 +15,6 @@ YoutubeSearchURLIE, YoutubeMusicSearchURLIE, YoutubeSubscriptionsIE, - YoutubeStoriesIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeYtBeIE, diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2c64f8e845..552ca099c4 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2499,29 +2499,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@abaointokyo', }, 'params': {'skip_download': True} - }, { - # Story. Requires specific player params to work. - 'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI', - 'info_dict': { - 'id': 'vv8qTUWmulI', - 'ext': 'mp4', - 'availability': 'unlisted', - 'view_count': int, - 'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA', - 'upload_date': '20220526', - 'categories': ['Education'], - 'title': 'Story', - 'channel': 'IT\'S HISTORY', - 'description': '', - 'duration': 12, - 'playable_in_embed': True, - 'age_limit': 0, - 'live_status': 'not_live', - 'tags': [], - 'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp', - 'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA', - }, - 'skip': 'stories get removed after some period of time', }, { 'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA', 'info_dict': { @@ -3620,7 +3597,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, } - if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': + if _split_innertube_client(client)[0] == 'android': yt_query['params'] = self._PLAYER_PARAMS yt_query.update(self._generate_player_context(sts)) @@ -4033,8 +4010,6 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - if smuggled_data.get('is_story'): # XXX: Deprecated - query['pp'] = self._PLAYER_PARAMS webpage = self._download_webpage( webpage_url, video_id, fatal=False, query=query) @@ -7145,22 +7120,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): }] -class YoutubeStoriesIE(InfoExtractor): - IE_DESC = 'YouTube channel stories; "ytstories:" prefix' - IE_NAME = 'youtube:stories' - _VALID_URL = r'ytstories:UC(?P<id>[A-Za-z0-9_-]{21}[AQgw])$' - _TESTS = [{ - 'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = f'RLTD{self._match_id(url)}' - return self.url_result( - smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}), - ie=YoutubeTabIE, video_id=playlist_id) - - class YoutubeShortsAudioPivotIE(InfoExtractor): IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' IE_NAME = 'youtube:shorts:pivot:audio' From 6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17 Mon Sep 17 00:00:00 2001 From: Jorge <46056498+jorgectf@users.noreply.github.com> Date: Thu, 6 Jul 2023 16:51:46 +0200 Subject: [PATCH 250/501] [misc] Add CodeQL workflow (#7497) --- .github/workflows/codeql.yml | 65 ++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..2821d90d06 --- /dev/null +++ 
b/.github/workflows/codeql.yml
@@ -0,0 +1,65 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ 'master', 'gh-pages', 'release' ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ 'master' ]
+  schedule:
+    - cron: '59 11 * * 5'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Use only 'java' to analyze code written in Java, Kotlin or both
+        # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v2
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+
+    # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v2
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+    # If the Autobuild fails above, remove it and uncomment the following three lines.
+    # Modify them (or add more) to build your code if your project needs a custom build step; refer to the EXAMPLE below for guidance.
+
+    # - run: |
+    #     echo "Run, Build Application using script"
+    #     ./location_of_script_within_repo/buildscript.sh
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v2
+      with:
+        category: "/language:${{matrix.language}}"

From 662ef1e910b72e57957f06589925b2332ba52821 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Tue, 4 Jul 2023 18:46:32 +0530
Subject: [PATCH 251/501] [downloader/http] Avoid infinite loop when no data
 is received

Closes #7504
---
 yt_dlp/downloader/http.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py
index e785f0d4ed..7c5daea859 100644
--- a/yt_dlp/downloader/http.py
+++ b/yt_dlp/downloader/http.py
@@ -339,15 +339,15 @@ def retry(e):
             elif speed:
                 ctx.throttle_start = None
 
-        if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
-            ctx.resume_len = byte_counter
-            # ctx.block_size = block_size
-            raise NextFragment()
-
         if ctx.stream is None:
            self.to_stderr('\n')
            self.report_error('Did not get any data blocks')
            return False
+
+        if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
+            ctx.resume_len = byte_counter
+            raise NextFragment()
+
         if ctx.tmpfilename != '-':
             ctx.stream.close()

From 47bcd437247152e0af5b3ebc5592db7bb66855c2 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Thu, 6 Jul 2023 18:08:44 +0530
Subject: [PATCH 252/501] [outtmpl] Pad `playlist_index` etc. even when using
 internal formatting

Closes #7501
---
 test/test_YoutubeDL.py |  2 +-
 yt_dlp/YoutubeDL.py    | 13 ++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index f495fa6d90..3fbcdd01f3 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -684,7 +684,7 @@ def test(tmpl, expected, *, info=None, **params):
         test('%(id)s.%(ext)s', '1234.mp4')
         test('%(duration_string)s', ('27:46:40', '27-46-40'))
         test('%(resolution)s', '1080p')
-        test('%(playlist_index)s', '001')
+        test('%(playlist_index|)s', '001')
         test('%(playlist_autonumber)s', '02')
         test('%(autonumber)s', '00001')
         test('%(autonumber+2)03d', '005', autonumber_start=3)
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 6dade0b2a4..448a15bc95 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1271,21 +1271,20 @@ def create_key(outer_mobj):
                 return outer_mobj.group(0)
             key = outer_mobj.group('key')
             mobj = re.match(INTERNAL_FORMAT_RE, key)
-            initial_field = mobj.group('fields') if mobj else ''
-            value, replacement, default = None, None, na
+            value, replacement, default, last_field = None, None, na, ''
             while mobj:
                 mobj = mobj.groupdict()
                 default = mobj['default'] if mobj['default'] is not None else default
                 value = get_value(mobj)
-                replacement = mobj['replacement']
+                last_field, replacement = mobj['fields'], mobj['replacement']
                 if value is None and mobj['alternate']:
                     mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:])
                 else:
                     break
 
             fmt = outer_mobj.group('format')
-            if fmt == 's' and value is not None and key in field_size_compat_map.keys():
-                fmt = f'0{field_size_compat_map[key]:d}d'
+            if fmt == 's' and value is not None and last_field in field_size_compat_map.keys():
+                fmt = f'0{field_size_compat_map[last_field]:d}d'
 
             if None not in (value, replacement):
                 try:
@@ -1322,7 +1321,7 @@ def create_key(outer_mobj):
                     value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s',
                                                   factor=1024 if '#' in flags
else 1000)
                 elif fmt[-1] == 'S':  # filename sanitization
-                    value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt
+                    value, fmt = filename_sanitizer(last_field, value, restricted='#' in flags), str_fmt
                 elif fmt[-1] == 'c':
                     if value:
                         value = str(value)[0]
@@ -1341,7 +1340,7 @@ def create_key(outer_mobj):
                 elif fmt[-1] == 'a':
                     value, fmt = ascii(value), str_fmt
                 if fmt[-1] in 'csra':
-                    value = sanitizer(initial_field, value)
+                    value = sanitizer(last_field, value)
 
             key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format'))
             TMPL_DICT[key] = value

From fa44802809d189fca0f4782263d48d6533384503 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Thu, 6 Jul 2023 17:34:51 +0530
Subject: [PATCH 253/501] [devscripts/make_changelog] Skip reverted commits

---
 devscripts/make_changelog.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py
index 0bcfa6ae72..eb0e3082f9 100644
--- a/devscripts/make_changelog.py
+++ b/devscripts/make_changelog.py
@@ -252,6 +252,7 @@ class CommitRange:
         (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))?
         ''', re.VERBOSE | re.DOTALL)
     EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE)
+    REVERT_RE = re.compile(r'(?i:Revert)\s+([\da-f]{40})')
     FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert)\s+([\da-f]{40})')
     UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)')
 
@@ -279,7 +280,7 @@ def _get_commits_and_fixes(self, default_author):
             self.COMMAND, 'log', f'--format=%H%n%s%n%b%n{self.COMMIT_SEPARATOR}',
             f'{self._start}..{self._end}' if self._start else self._end).stdout
 
-        commits = {}
+        commits, reverts = {}, {}
         fixes = defaultdict(list)
         lines = iter(result.splitlines(False))
         for i, commit_hash in enumerate(lines):
@@ -300,6 +301,11 @@ def _get_commits_and_fixes(self, default_author):
                 logger.debug(f'Reached Release commit, breaking: {commit}')
                 break
 
+            revert_match = self.REVERT_RE.fullmatch(commit.short)
+            if revert_match:
+                reverts[revert_match.group(1)] = commit
+                continue
+
             fix_match = self.FIXES_RE.search(commit.short)
             if fix_match:
                 commitish = fix_match.group(1)
@@ -307,6 +313,13 @@
 
             commits[commit.hash] = commit
 
+        for commitish, revert_commit in reverts.items():
+            reverted = commits.pop(commitish, None)
+            if reverted:
+                logger.debug(f'{revert_commit} fully reverted {reverted}')
+            else:
+                commits[revert_commit.hash] = revert_commit
+
         for commitish, fix_commits in fixes.items():
             if commitish in commits:
                 hashes = ', '.join(commit.hash[:HASH_LENGTH] for commit in fix_commits)

From 337734d4a8a6500bc65434843db346b5cbd05e81 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Thu, 6 Jul 2023 20:09:42 +0530
Subject: [PATCH 254/501] [cleanup] Misc

---
 devscripts/make_changelog.py  | 7 ++++---
 setup.cfg                     | 1 -
 yt_dlp/YoutubeDL.py           | 2 +-
 yt_dlp/downloader/common.py   | 3 ++-
 yt_dlp/downloader/fragment.py | 4 +---
 yt_dlp/extractor/adobepass.py | 2 +-
 yt_dlp/extractor/iqiyi.py     | 2 +-
 yt_dlp/extractor/vshare.py    | 2 +-
 yt_dlp/extractor/youtube.py   | 2 +-
 yt_dlp/utils/__init__.py      | 1 +
 10 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py
index eb0e3082f9..3ad4c5408b 100644
--- a/devscripts/make_changelog.py
+++ b/devscripts/make_changelog.py
@@ -55,6 +55,7 @@ def commit_lookup(cls):
                 'dependencies',
                 'jsinterp',
                 'outtmpl',
+                'formats',
                 'plugins',
                 'update',
'upstream', @@ -68,9 +69,9 @@ def commit_lookup(cls): 'misc', 'test', }, - cls.EXTRACTOR: {'extractor'}, - cls.DOWNLOADER: {'downloader'}, - cls.POSTPROCESSOR: {'postprocessor'}, + cls.EXTRACTOR: {'extractor', 'ie'}, + cls.DOWNLOADER: {'downloader', 'fd'}, + cls.POSTPROCESSOR: {'postprocessor', 'pp'}, }.items() for name in names } diff --git a/setup.cfg b/setup.cfg index 68d9e516d1..6deaa79715 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,7 +8,6 @@ ignore = E402,E501,E731,E741,W503 max_line_length = 120 per_file_ignores = devscripts/lazy_load_template.py: F401 - yt_dlp/utils/__init__.py: F401, F403 [autoflake] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index d4aff0743e..448a15bc95 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3687,7 +3687,7 @@ def render_formats_table(self, info_dict): def simplified_codec(f, field): assert field in ('acodec', 'vcodec') - codec = f.get(field, 'unknown') + codec = f.get(field) if not codec: return 'unknown' elif codec != 'none': diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index a0219a3509..8fe9d99930 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -255,7 +255,8 @@ def sanitize_open(self, filename, open_mode): @wrap_file_access('remove') def try_remove(self, filename): - os.remove(filename) + if os.path.isfile(filename): + os.remove(filename) @wrap_file_access('rename') def try_rename(self, old_filename, new_filename): diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 458167216c..0698153269 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -300,9 +300,7 @@ def frag_progress_hook(s): def _finish_frag_download(self, ctx, info_dict): ctx['dest_stream'].close() if self.__do_ytdl_file(ctx): - ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) - if os.path.isfile(ytdl_filename): - self.try_remove(ytdl_filename) + self.try_remove(self.ytdl_filename(ctx['filename'])) elapsed = time.time() - ctx['started'] to_file = ctx['tmpfilename'] != '-' diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 68a970f68c..722a534ed6 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1473,7 +1473,7 @@ def extract_redirect_url(html, url=None, fatal=False): elif 'automatically signed in with' in provider_redirect_page: # Seems like comcast is rolling up new way of automatically signing customers oauth_redirect_url = self._html_search_regex( - r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page, + r'continue:\s*"(https://oauth\.xfinity\.com/oauth/authorize\?.+)"', provider_redirect_page, 'oauth redirect (signed)') # Just need to process the request. 
No useful data comes back self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login') diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index ebf49e8359..fa602ba887 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -527,7 +527,7 @@ def _extract_vms_player_js(self, webpage, video_id): if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( - r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) + r'<script src="((?:https?:)?//stc\.iqiyipic\.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS') webpack_map = self._search_json( r'["\']\s*\+\s*', webpack_js, 'JS locations', video_id, diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index 1bc7ae4ba1..443ed43cc4 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -22,7 +22,7 @@ def _extract_packed(self, webpage): packed = self._search_regex( r'(eval\(function.+)', webpage, 'packed code') unpacked = decode_packed_codes(packed) - digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits') + digits = self._search_regex(r'\[([\d,]+)\]', unpacked, 'digits') digits = [int(digit) for digit in digits.split(',')] key_digit = self._search_regex( r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit') diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 552ca099c4..2a8106b45c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3117,7 +3117,7 @@ def _extract_n_function_name(self, jscode): return funcname return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])[,;]', jscode, + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] def _extract_n_function_code(self, video_id, player_url): diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py index 74b39e2c7b..2dd20ada25 100644 --- a/yt_dlp/utils/__init__.py +++ b/yt_dlp/utils/__init__.py @@ -1,3 +1,4 @@ +# flake8: noqa: F401, F403 import warnings from ..compat.compat_utils import passthrough_module From 906c0bdcd8974340d619e99ccd613c163eb0d0c2 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 6 Jul 2023 18:17:42 +0530 Subject: [PATCH 255/501] [formats] Fix best fallback for storyboards Partial fix for #7478 --- yt_dlp/YoutubeDL.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 448a15bc95..2c5014f870 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2766,11 +2766,8 @@ def is_wellformed(f): formats_to_download = list(format_selector({ 'formats': formats, 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), - 'incomplete_formats': ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)), + 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video + or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio })) if interactive_format_selection and not formats_to_download: self.report_error('Requested 
format is not available', tb=False, is_error=False) From bc344cd456380999c1ee74554dfd432a38f32ec7 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 6 Jul 2023 18:39:50 +0530 Subject: [PATCH 256/501] [core] Allow extractors to mark formats as potentially DRM (#7396) This is useful for HLS where detecting whether the format is actually DRM requires the child manifest to be downloaded. Makes the error message when using `--test` inconsistent, but doesn't really matter. --- yt_dlp/YoutubeDL.py | 37 ++++++++++++++++++++++--------------- yt_dlp/downloader/hls.py | 30 ++++++++++++++++++++---------- yt_dlp/extractor/common.py | 10 ++++------ 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 2c5014f870..cf0122d4ba 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -983,6 +983,7 @@ def trouble(self, message=None, tb=None, is_error=True): ID='green', DELIM='blue', ERROR='red', + BAD_FORMAT='light red', WARNING='yellow', SUPPRESS='light black', ) @@ -2085,8 +2086,6 @@ def syntax_error(note, start): allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False), 'video': self.params.get('allow_multiple_video_streams', False)} - check_formats = self.params.get('check_formats') == 'selected' - def _parse_filter(tokens): filter_parts = [] for type, string_, start, _, _ in tokens: @@ -2259,10 +2258,19 @@ def _merge(formats_pair): return new_dict def _check_formats(formats): - if not check_formats: + if (self.params.get('check_formats') is not None + or self.params.get('allow_unplayable_formats')): yield from formats return - yield from self._check_formats(formats) + elif self.params.get('check_formats') == 'selected': + yield from self._check_formats(formats) + return + + for f in formats: + if f.get('has_drm'): + yield from self._check_formats([f]) + else: + yield f def _build_selector_function(selector): if isinstance(selector, list): # , @@ -2614,10 +2622,10 @@ def sanitize_numeric_fields(info): if field_preference: info_dict['_format_sort_fields'] = field_preference - # or None ensures --clean-infojson removes it - info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None + info_dict['_has_drm'] = any( # or None ensures --clean-infojson removes it + f.get('has_drm') and f['has_drm'] != 'maybe' for f in formats) or None if not self.params.get('allow_unplayable_formats'): - formats = [f for f in formats if not f.get('has_drm')] + formats = [f for f in formats if not f.get('has_drm') or f['has_drm'] == 'maybe'] if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats): self.report_warning( @@ -3719,14 +3727,13 @@ def simplified_codec(f, field): simplified_codec(f, 'acodec'), format_field(f, 'abr', '\t%dk', func=round), format_field(f, 'asr', '\t%s', func=format_decimal_suffix), - join_nonempty( - self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, - self._format_out('DRM', 'light red') if f.get('has_drm') else None, - format_field(f, 'language', '[%s]'), - join_nonempty(format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - delim=', '), - delim=' '), + join_nonempty(format_field(f, 'language', '[%s]'), join_nonempty( + self._format_out('UNSUPPORTED', self.Styles.BAD_FORMAT) if f.get('ext') in ('f4f', 'f4m') else None, + (self._format_out('Maybe DRM', self.Styles.WARNING) if f.get('has_drm') == 'maybe' + else self._format_out('DRM', self.Styles.BAD_FORMAT) if 
f.get('has_drm') else None), + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index f2868dc52b..ab7d496d42 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -28,7 +28,16 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' @staticmethod - def can_download(manifest, info_dict, allow_unplayable_formats=False): + def _has_drm(manifest): # TODO: https://github.com/yt-dlp/yt-dlp/pull/5039 + return bool(re.search('|'.join(( + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay + r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + )), manifest)) + + @classmethod + def can_download(cls, manifest, info_dict, allow_unplayable_formats=False): UNSUPPORTED_FEATURES = [ # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] @@ -50,13 +59,15 @@ def can_download(manifest, info_dict, allow_unplayable_formats=False): ] if not allow_unplayable_formats: UNSUPPORTED_FEATURES += [ - r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM ] def check_results(): yield not info_dict.get('is_live') for feature in UNSUPPORTED_FEATURES: yield not re.search(feature, manifest) + if not allow_unplayable_formats: + yield not cls._has_drm(manifest) return all(check_results()) def real_download(self, filename, info_dict): @@ -81,14 +92,13 @@ def real_download(self, filename, info_dict): message = ('Live HLS streams are not supported by the native downloader. 
If this is a livestream, ' f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') if not can_download: - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), s) - if has_drm and not self.params.get('allow_unplayable_formats'): - self.report_error( - 'This video is DRM protected; Try selecting another format with --format or ' - 'add --check-formats to automatically fallback to the next best format') + if self._has_drm(s) and not self.params.get('allow_unplayable_formats'): + if info_dict.get('has_drm') and self.params.get('test'): + self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True) + else: + self.report_error( + 'This format is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format', tb=False) return False message = message or 'Unsupported features have been detected' fd = FFmpegFD(self.ydl, self.params) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 3f7dcb82bb..fe08839aaa 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -26,6 +26,7 @@ from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD from ..utils import ( IDENTITY, JSON_LD_RE, @@ -224,7 +225,8 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. * extra_param_to_segment_url A query string to append to each fragment's URL, or to update each existing query string with. Only applied by the native HLS/DASH downloaders. 
@@ -1979,11 +1981,7 @@ def _parse_m3u8_formats_and_subtitles( errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), m3u8_doc) + has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) From 94ed638a437fc766699d440e978982e24ce6a30a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 23 Jun 2023 18:16:07 +0530 Subject: [PATCH 257/501] [ie/youtube] Avoid false DRM detection (#7396) Some master manifests contain a mix of DRM and non-DRM formats --- yt_dlp/extractor/youtube.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2a8106b45c..73bfa662d2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3927,9 +3927,12 @@ def process_manifest_format(f, proto, client_name, itag): elif itag: f['format_id'] = itag + if f.get('source_preference') is None: + f['source_preference'] = -1 + if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') - f['source_preference'] = (f.get('source_preference') or -1) + 100 + f['source_preference'] += 100 f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): @@ -3938,6 +3941,10 @@ def process_manifest_format(f, proto, client_name, itag): f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] + + if proto == 'hls' and f.get('has_drm'): + f['has_drm'] = 'maybe' + f['source_preference'] -= 5 return True subtitles = {} @@ -4037,6 +4044,10 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, else None) streaming_data = traverse_obj(player_responses, (..., 'streamingData')) *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) + if all(f.get('has_drm') for f in formats): + # If there are no formats that definitely don't have DRM, all have DRM + for f in formats: + f['has_drm'] = True return live_broadcast_details, live_status, streaming_data, formats, subtitles From ad8902f616ad2541f9b9626738f1393fad89a64c Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 6 Jul 2023 19:35:49 +0530 Subject: [PATCH 258/501] [ie/vidlii] Handle relative URLs Closes #7480 --- yt_dlp/extractor/vidlii.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index 5933783ae6..cde4274d9c 100644 --- a/yt_dlp/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py @@ -70,6 +70,7 @@ def _real_extract(self, url): r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage) or []] for source in sources: + source = urljoin(url, source) height = int(self._search_regex(r'(\d+).mp4', source, 'height', default=360)) if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False): formats.append({ From 1ceb657bdd254ad961489e5060f2ccc7d556b729 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Wed, 5 Jul 2023 15:16:28 -0500 Subject: [PATCH 259/501] [fd/external] Scope cookies - ffmpeg: Calculate cookies from cookiejar and pass with `-cookies` arg instead of 
  `-headers`
- aria2c, curl, wget: Write cookiejar to file and use external FD built-in
  cookiejar support
- httpie: Calculate cookies from cookiejar instead of `http_headers`
- axel: Calculate cookies from cookiejar and disable http redirection if
  cookies are passed
    - May break redirects, but axel simply doesn't have proper cookie support

Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj

Authored by: bashonly, coletdjnz
---
 test/test_downloader_external.py | 133 +++++++++++++++++++++++++++++++
 yt_dlp/cookies.py                |   7 ++
 yt_dlp/downloader/external.py    |  41 +++++++++-
 3 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 test/test_downloader_external.py

diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py
new file mode 100644
index 0000000000..e5b02ba5a4
--- /dev/null
+++ b/test/test_downloader_external.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+
+# Allow direct execution
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import http.cookiejar
+
+from test.helper import FakeYDL
+from yt_dlp.downloader.external import (
+    Aria2cFD,
+    AxelFD,
+    CurlFD,
+    FFmpegFD,
+    HttpieFD,
+    WgetFD,
+)
+
+TEST_COOKIE = {
+    'version': 0,
+    'name': 'test',
+    'value': 'ytdlp',
+    'port': None,
+    'port_specified': False,
+    'domain': '.example.com',
+    'domain_specified': True,
+    'domain_initial_dot': False,
+    'path': '/',
+    'path_specified': True,
+    'secure': False,
+    'expires': None,
+    'discard': False,
+    'comment': None,
+    'comment_url': None,
+    'rest': {},
+}
+
+TEST_INFO = {'url': 'http://www.example.com/'}
+
+
+class TestHttpieFD(unittest.TestCase):
+    def test_make_cmd(self):
+        with FakeYDL() as ydl:
+            downloader = HttpieFD(ydl, {})
+            self.assertEqual(
+                downloader._make_cmd('test', TEST_INFO),
+                ['http', '--download', '--output', 'test', 'http://www.example.com/'])
+
+            # Test cookie header is added
+            ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+            self.assertEqual(
+                downloader._make_cmd('test', TEST_INFO),
+                ['http', '--download', '--output', 'test', 'http://www.example.com/', 'Cookie:test=ytdlp'])
+
+
+class TestAxelFD(unittest.TestCase):
+    def test_make_cmd(self):
+        with FakeYDL() as ydl:
+            downloader = AxelFD(ydl, {})
+            self.assertEqual(
+                downloader._make_cmd('test', TEST_INFO),
+                ['axel', '-o', 'test', '--', 'http://www.example.com/'])
+
+            # Test cookie header is added
+            ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+            self.assertEqual(
+                downloader._make_cmd('test', TEST_INFO),
+                ['axel', '-o', 'test', 'Cookie: test=ytdlp', '--max-redirect=0', '--', 'http://www.example.com/'])
+
+
+class TestWgetFD(unittest.TestCase):
+    def test_make_cmd(self):
+        with FakeYDL() as ydl:
+            downloader = WgetFD(ydl, {})
+            self.assertNotIn('--load-cookies', downloader._make_cmd('test', TEST_INFO))
+            # Test cookiejar tempfile arg is added
+            ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+            self.assertIn('--load-cookies', downloader._make_cmd('test', TEST_INFO))
+
+
+class TestCurlFD(unittest.TestCase):
+    def test_make_cmd(self):
+        with FakeYDL() as ydl:
+            downloader = CurlFD(ydl, {})
+            self.assertNotIn('--cookie-jar', downloader._make_cmd('test', TEST_INFO))
+            # Test cookiejar tempfile arg is added
+            ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE))
+            self.assertIn('--cookie-jar', downloader._make_cmd('test', TEST_INFO))
+
+
+class TestAria2cFD(unittest.TestCase):
+    def test_make_cmd(self):
+        with FakeYDL() as ydl:
+            
downloader = Aria2cFD(ydl, {}) + downloader._make_cmd('test', TEST_INFO) + self.assertFalse(hasattr(downloader, '_cookies_tempfile')) + + # Test cookiejar tempfile arg is added + ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE)) + cmd = downloader._make_cmd('test', TEST_INFO) + self.assertIn(f'--load-cookies={downloader._cookies_tempfile}', cmd) + + +@unittest.skipUnless(FFmpegFD.available(), 'ffmpeg not found') +class TestFFmpegFD(unittest.TestCase): + _args = [] + + def _test_cmd(self, args): + self._args = args + + def test_make_cmd(self): + with FakeYDL() as ydl: + downloader = FFmpegFD(ydl, {}) + downloader._debug_cmd = self._test_cmd + + downloader._call_downloader('test', {**TEST_INFO, 'ext': 'mp4'}) + self.assertEqual(self._args, [ + 'ffmpeg', '-y', '-hide_banner', '-i', 'http://www.example.com/', + '-c', 'copy', '-f', 'mp4', 'file:test']) + + # Test cookies arg is added + ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE)) + downloader._call_downloader('test', {**TEST_INFO, 'ext': 'mp4'}) + self.assertEqual(self._args, [ + 'ffmpeg', '-y', '-hide_banner', '-cookies', 'test=ytdlp; path=/; domain=.example.com;\r\n', + '-i', 'http://www.example.com/', '-c', 'copy', '-f', 'mp4', 'file:test']) + + +if __name__ == '__main__': + unittest.main() diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index f21e4f7e7b..53fe0ec2d3 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1327,6 +1327,13 @@ def get_cookie_header(self, url): self.add_cookie_header(cookie_req) return cookie_req.get_header('Cookie') + def get_cookies_for_url(self, url): + """Generate a list of Cookie objects for a given url""" + # Policy `_now` attribute must be set before calling `_cookies_for_request` + # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 + self._policy._now = self._now = int(time.time()) + return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url)))) + def clear(self, *args, **kwargs): with contextlib.suppress(KeyError): return super().clear(*args, **kwargs) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index f637a100bf..d4045e58f9 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -1,9 +1,10 @@ import enum import json -import os.path +import os import re import subprocess import sys +import tempfile import time import uuid @@ -42,6 +43,7 @@ class ExternalFD(FragmentFD): def real_download(self, filename, info_dict): self.report_destination(filename) tmpfilename = self.temp_name(filename) + self._cookies_tempfile = None try: started = time.time() @@ -54,6 +56,9 @@ def real_download(self, filename, info_dict): # should take place retval = 0 self.to_screen('[%s] Interrupted by user' % self.get_basename()) + finally: + if self._cookies_tempfile: + self.try_remove(self._cookies_tempfile) if retval == 0: status = { @@ -125,6 +130,16 @@ def _configuration_args(self, keys=None, *args, **kwargs): self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME, keys, *args, **kwargs) + def _write_cookies(self): + if not self.ydl.cookiejar.filename: + tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False) + tmp_cookies.close() + self._cookies_tempfile = tmp_cookies.name + self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"') + # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename + self.ydl.cookiejar.save(self._cookies_tempfile) + return 
self.ydl.cookiejar.filename or self._cookies_tempfile + def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] @@ -184,6 +199,8 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--cookie-jar', self._write_cookies()] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -214,6 +231,9 @@ def _make_cmd(self, tmpfilename, info_dict): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['-H', f'{key}: {val}'] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += [f'Cookie: {cookie_header}', '--max-redirect=0'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -223,7 +243,9 @@ class WgetFD(ExternalFD): AVAILABLE_OPT = '--version' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto'] + cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--load-cookies', self._write_cookies()] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -279,6 +301,8 @@ def _make_cmd(self, tmpfilename, info_dict): else: cmd += ['--min-split-size', '1M'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += [f'--load-cookies={self._write_cookies()}'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -417,6 +441,14 @@ def _make_cmd(self, tmpfilename, info_dict): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += [f'{key}:{val}'] + + # httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1] + # If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2] + # 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq + # 2: https://httpie.io/docs/cli/sessions + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += [f'Cookie:{cookie_header}'] return cmd @@ -527,6 +559,11 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): + cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) + if cookies: + args.extend(['-cookies', ''.join( + f'{cookie.name}={cookie.value}; path={cookie.path}; domain={cookie.domain};\r\n' + for cookie in cookies)]) if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
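The per-URL scoping that the downloaders above rely on can be sketched with the standard library alone. This is a minimal illustration (the cookie values and URLs are made up; yt-dlp wraps the same mechanism in the cookiejar helpers added to yt_dlp/cookies.py above):

import http.cookiejar
import urllib.request

jar = http.cookiejar.CookieJar()
jar.set_cookie(http.cookiejar.Cookie(
    version=0, name='test', value='ytdlp', port=None, port_specified=False,
    domain='.example.com', domain_specified=True, domain_initial_dot=False,
    path='/', path_specified=True, secure=False, expires=None, discard=False,
    comment=None, comment_url=None, rest={}))

def cookie_header(url):
    # The public add_cookie_header() handles the policy `_now` bookkeeping
    # that the patch's get_cookies_for_url() has to set by hand
    req = urllib.request.Request(url)
    jar.add_cookie_header(req)
    return req.get_header('Cookie')

print(cookie_header('http://www.example.com/'))  # -> test=ytdlp
print(cookie_header('http://www.example.org/'))  # -> None (cookie not in scope)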
From f8b4bcc0a791274223723488bfbfc23ea3276641 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 6 Jun 2023 20:44:51 +1200 Subject: [PATCH 260/501] [core] Prevent `Cookie` leaks on HTTP redirect Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj Authored by: coletdjnz --- test/test_http.py | 31 +++++++++++++++++++++++++++++++ yt_dlp/utils/_utils.py | 9 +++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/test/test_http.py b/test/test_http.py index 3941a6e776..e4e66dce18 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -132,6 +132,11 @@ def do_GET(self): self._method('GET') elif self.path.startswith('/headers'): self._headers() + elif self.path.startswith('/308-to-headers'): + self.send_response(308) + self.send_header('Location', '/headers') + self.send_header('Content-Length', '0') + self.end_headers() elif self.path == '/trailing_garbage': payload = b'<html><video src="/vid.mp4" /></html>' self.send_response(200) @@ -270,6 +275,7 @@ def do_req(redirect_status, method): self.assertEqual(do_req(303, 'PUT'), ('', 'GET')) # 301 and 302 turn POST only into a GET + # XXX: we should also test if the Content-Type and Content-Length headers are removed self.assertEqual(do_req(301, 'POST'), ('', 'GET')) self.assertEqual(do_req(301, 'HEAD'), ('', 'HEAD')) self.assertEqual(do_req(302, 'POST'), ('', 'GET')) @@ -313,6 +319,31 @@ def test_cookiejar(self): data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers')).read() self.assertIn(b'Cookie: test=ytdlp', data) + def test_passed_cookie_header(self): + # We should accept a Cookie header being passed as in normal headers and handle it appropriately. + with FakeYDL() as ydl: + # Specified Cookie header should be used + res = ydl.urlopen( + sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers', + headers={'Cookie': 'test=test'})).read().decode('utf-8') + self.assertIn('Cookie: test=test', res) + + # Specified Cookie header should be removed on any redirect + res = ydl.urlopen( + sanitized_Request(f'http://127.0.0.1:{self.http_port}/308-to-headers', headers={'Cookie': 'test=test'})).read().decode('utf-8') + self.assertNotIn('Cookie: test=test', res) + + # Specified Cookie header should override global cookiejar for that request + ydl.cookiejar.set_cookie(http.cookiejar.Cookie( + version=0, name='test', value='ytdlp', port=None, port_specified=False, + domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/', + path_specified=True, secure=False, expires=None, discard=False, comment=None, + comment_url=None, rest={})) + + data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'Cookie': 'test=test'})).read() + self.assertNotIn(b'Cookie: test=ytdlp', data) + self.assertIn(b'Cookie: test=test', data) + def test_no_compression_compat_header(self): with FakeYDL() as ydl: data = ydl.urlopen( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index f68cdb9686..82d9ba4d57 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1556,7 +1556,12 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): new_method = req.get_method() new_data = req.data - remove_headers = [] + + # Technically the Cookie header should be in unredirected_hdrs, + # however in practice some may set it in normal headers anyway. + # We will remove it here to prevent any leaks. 
+ remove_headers = ['Cookie'] + # A 303 must either use GET or HEAD for subsequent request # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 if code == 303 and req.get_method() != 'HEAD': @@ -1573,7 +1578,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): new_data = None remove_headers.extend(['Content-Length', 'Content-Type']) - new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers} + new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers} return urllib.request.Request( newurl, headers=new_headers, origin_req_host=req.origin_req_host, From 3121512228487c9c690d3d39bfd2579addf96e07 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Thu, 6 Jul 2023 21:51:04 +0530 Subject: [PATCH 261/501] [core] Change how `Cookie` headers are handled Cookies are now saved and loaded under `cookies` key in the info dict instead of `http_headers.Cookie`. Cookies passed in headers are auto-scoped to the input URLs with a warning. Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj Authored by: Grub4K --- test/test_YoutubeDL.py | 56 ++++++++++++++++++++++++++ yt_dlp/YoutubeDL.py | 80 +++++++++++++++++++++++++++++++++++-- yt_dlp/downloader/common.py | 7 +++- 3 files changed, 139 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3fbcdd01f3..c15c7704c5 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1213,6 +1213,62 @@ def _real_extract(self, url): self.assertEqual(downloaded['extractor'], 'Video') self.assertEqual(downloaded['extractor_key'], 'Video') + def test_header_cookies(self): + from http.cookiejar import Cookie + + ydl = FakeYDL() + ydl.report_warning = lambda *_, **__: None + + def cookie(name, value, version=None, domain='', path='', secure=False, expires=None): + return Cookie( + version or 0, name, value, None, False, + domain, bool(domain), bool(domain), path, bool(path), + secure, expires, False, None, None, rest={}) + + _test_url = 'https://yt.dlp/test' + + def test(encoded_cookies, cookies, headers=False, round_trip=None, error=None): + def _test(): + ydl.cookiejar.clear() + ydl._load_cookies(encoded_cookies, from_headers=headers) + if headers: + ydl._apply_header_cookies(_test_url) + data = {'url': _test_url} + ydl._calc_headers(data) + self.assertCountEqual( + map(vars, ydl.cookiejar), map(vars, cookies), + 'Extracted cookiejar.Cookie is not the same') + if not headers: + self.assertEqual( + data.get('cookies'), round_trip or encoded_cookies, + 'Cookie is not the same as round trip') + ydl.__dict__['_YoutubeDL__header_cookies'] = [] + + with self.subTest(msg=encoded_cookies): + if not error: + _test() + return + with self.assertRaisesRegex(Exception, error): + _test() + + test('test=value; Domain=.yt.dlp', [cookie('test', 'value', domain='.yt.dlp')]) + test('test=value', [cookie('test', 'value')], error='Unscoped cookies are not allowed') + test('cookie1=value1; Domain=.yt.dlp; Path=/test; cookie2=value2; Domain=.yt.dlp; Path=/', [ + cookie('cookie1', 'value1', domain='.yt.dlp', path='/test'), + cookie('cookie2', 'value2', domain='.yt.dlp', path='/')]) + test('test=value; Domain=.yt.dlp; Path=/test; Secure; Expires=9999999999', [ + cookie('test', 'value', domain='.yt.dlp', path='/test', secure=True, expires=9999999999)]) + test('test="value; "; path=/test; domain=.yt.dlp', [ + cookie('test', 'value; ', domain='.yt.dlp', path='/test')], + round_trip='test="value\\073 "; Domain=.yt.dlp; 
Path=/test') + test('name=; Domain=.yt.dlp', [cookie('name', '', domain='.yt.dlp')], + round_trip='name=""; Domain=.yt.dlp') + + test('test=value', [cookie('test', 'value', domain='.yt.dlp')], headers=True) + test('cookie1=value; Domain=.yt.dlp; cookie2=value', [], headers=True, error='Invalid syntax') + ydl.deprecated_feature = ydl.report_error + test('test=value', [], headers=True, error='Passing cookies as a header is a potential security risk') + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index cf0122d4ba..7f55716669 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1,9 +1,11 @@ import collections import contextlib +import copy import datetime import errno import fileinput import functools +import http.cookiejar import io import itertools import json @@ -25,7 +27,7 @@ from .cache import Cache from .compat import urllib # isort: split from .compat import compat_os_name, compat_shlex_quote -from .cookies import load_cookies +from .cookies import LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor @@ -673,6 +675,9 @@ def process_color_policy(stream): if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() + self.__header_cookies = [] + self._load_cookies(traverse_obj(self.params.get('http_headers'), 'cookie', casesense=False)) # compat + def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: self.report_warning(f'{option} is deprecated. Use {suggestion} instead') @@ -1625,8 +1630,60 @@ def progress(msg): self.to_screen('') raise + def _load_cookies(self, data, *, from_headers=True): + """Loads cookies from a `Cookie` header + + This tries to work around the security vulnerability of passing cookies to every domain. + See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + The unscoped cookies are saved for later to be stored in the jar with a limited scope. + + @param data The Cookie header as string to load the cookies from + @param from_headers If `False`, allows Set-Cookie syntax in the cookie string (at least a domain will be required) + """ + for cookie in LenientSimpleCookie(data).values(): + if from_headers and any(cookie.values()): + raise ValueError('Invalid syntax in Cookie Header') + + domain = cookie.get('domain') or '' + expiry = cookie.get('expires') + if expiry == '': # 0 is valid + expiry = None + prepared_cookie = http.cookiejar.Cookie( + cookie.get('version') or 0, cookie.key, cookie.value, None, False, + domain, True, True, cookie.get('path') or '', bool(cookie.get('path')), + cookie.get('secure') or False, expiry, False, None, None, {}) + + if domain: + self.cookiejar.set_cookie(prepared_cookie) + elif from_headers: + self.deprecated_feature( + 'Passing cookies as a header is a potential security risk; ' + 'they will be scoped to the domain of the downloaded urls. ' + 'Please consider loading cookies from a file or browser instead.') + self.__header_cookies.append(prepared_cookie) + else: + self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping', + tb=False, is_error=False) + + def _apply_header_cookies(self, url): + """Applies stray header cookies to the provided url + + This loads header cookies and scopes them to the domain provided in `url`. 
+ While this is not ideal, it helps reduce the risk of them being sent + to an unintended destination while mostly maintaining compatibility. + """ + parsed = urllib.parse.urlparse(url) + if not parsed.hostname: + return + + for cookie in map(copy.copy, self.__header_cookies): + cookie.domain = f'.{parsed.hostname}' + self.cookiejar.set_cookie(cookie) + @_handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): + self._apply_header_cookies(url) + try: ie_result = ie.extract(url) except UserNotLive as e: @@ -2414,9 +2471,24 @@ def _calc_headers(self, info_dict): if 'Youtubedl-No-Compression' in res: # deprecated res.pop('Youtubedl-No-Compression', None) res['Accept-Encoding'] = 'identity' - cookies = self.cookiejar.get_cookie_header(info_dict['url']) + cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: - res['Cookie'] = cookies + encoder = LenientSimpleCookie() + values = [] + for cookie in cookies: + _, value = encoder.value_encode(cookie.value) + values.append(f'{cookie.name}={value}') + if cookie.domain: + values.append(f'Domain={cookie.domain}') + if cookie.path: + values.append(f'Path={cookie.path}') + if cookie.secure: + values.append('Secure') + if cookie.expires: + values.append(f'Expires={cookie.expires}') + if cookie.version: + values.append(f'Version={cookie.version}') + info_dict['cookies'] = '; '.join(values) if 'X-Forwarded-For' not in res: x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') @@ -3423,6 +3495,8 @@ def download_with_info_file(self, info_filename): infos = [self.sanitize_info(info, self.params.get('clean_infojson', True)) for info in variadic(json.loads('\n'.join(f)))] for info in infos: + self._load_cookies(info.get('cookies'), from_headers=False) + self._load_cookies(traverse_obj(info.get('http_headers'), 'Cookie', casesense=False)) # compat try: self.__download_wrapper(self.process_ie_result)(info, download=True) except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 8fe9d99930..2c404ee902 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -32,6 +32,7 @@ timetuple_from_msec, try_call, ) +from ..utils.traversal import traverse_obj class FileDownloader: @@ -419,7 +420,6 @@ def download(self, filename, info_dict, subtitle=False): """Download to a filename using the info from info_dict Return True on success and False otherwise """ - nooverwrites_and_exists = ( not self.params.get('overwrites', True) and os.path.exists(encodeFilename(filename)) @@ -453,6 +453,11 @@ def download(self, filename, info_dict, subtitle=False): self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...') time.sleep(sleep_interval) + # Filter the `Cookie` header from the info_dict to prevent leaks. 
+ # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + info_dict['http_headers'] = dict(traverse_obj(info_dict, ( + 'http_headers', {dict.items}, lambda _, pair: pair[0].lower() != 'cookie'))) or None + ret = self.real_download(filename, info_dict) self._finish_multiline_status() return ret, True From b532a3481046e1eabb6232ee8196fb696c356ff6 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 6 Jul 2023 19:18:35 +0530 Subject: [PATCH 262/501] [docs] Minor fixes Closes #7515 --- Changelog.md | 6 ++++-- README.md | 14 +++++++------- devscripts/changelog_override.json | 20 ++++++++++++++------ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/Changelog.md b/Changelog.md index c340b74c9c..d7efa5d259 100644 --- a/Changelog.md +++ b/Changelog.md @@ -9,6 +9,8 @@ ### 2023.06.22 #### Core changes - [Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb](https://github.com/yt-dlp/yt-dlp/commit/d7cd97e8d8d42b500fea9abb2aa4ac9b0f98b2ad) by [pukkandan](https://github.com/pukkandan) - [Improve `--download-sections`](https://github.com/yt-dlp/yt-dlp/commit/b4e0d75848e9447cee2cd3646ce54d4744a7ff56) by [pukkandan](https://github.com/pukkandan) + - Support negative time-ranges + - Add `*from-url` to obey time-ranges in URL - [Indicate `filesize` approximated from `tbr` better](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) by [pukkandan](https://github.com/pukkandan) #### Extractor changes @@ -19,7 +21,7 @@ #### Extractor changes - **nebula**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3f756c8c4095b942cf49788eb0862ceaf57847f2) ([#7156](https://github.com/yt-dlp/yt-dlp/issues/7156)) by [Lamieur](https://github.com/Lamieur), [rohieb](https://github.com/rohieb) - **rheinmaintv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/98cb1eda7a4cf67c96078980dbd63e6c06ad7f7c) ([#7311](https://github.com/yt-dlp/yt-dlp/issues/7311)) by [barthelmannk](https://github.com/barthelmannk) - **youtube** - - [Add `ios` to default clients used](https://github.com/yt-dlp/yt-dlp/commit/1e75d97db21152acc764b30a688e516f04b8a142) + - [Add `ios` to default clients used](https://github.com/yt-dlp/yt-dlp/commit/1e75d97db21152acc764b30a688e516f04b8a142) by [pukkandan](https://github.com/pukkandan) - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively - IOS also has higher bit-rate 'premium' formats though they are not labeled as such - [Improve description parsing performance](https://github.com/yt-dlp/yt-dlp/commit/71dc18fa29263a1ff0472c23d81bfc8dd4422d48) ([#7315](https://github.com/yt-dlp/yt-dlp/issues/7315)) by [berkanteber](https://github.com/berkanteber), [pukkandan](https://github.com/pukkandan) @@ -27,7 +29,7 @@ #### Extractor changes - [Workaround 403 for android formats](https://github.com/yt-dlp/yt-dlp/commit/81ca451480051d7ce1a31c017e005358345a9149) by [pukkandan](https://github.com/pukkandan) #### Misc. 
changes -- [Revert "Add automatic duplicate issue detection"](https://github.com/yt-dlp/yt-dlp/commit/a4486bfc1dc7057efca9dd3fe70d7fa25c56f700) +- [Revert "Add automatic duplicate issue detection"](https://github.com/yt-dlp/yt-dlp/commit/a4486bfc1dc7057efca9dd3fe70d7fa25c56f700) by [pukkandan](https://github.com/pukkandan) - **cleanup** - Miscellaneous - [7f9c6a6](https://github.com/yt-dlp/yt-dlp/commit/7f9c6a63b16e145495479e9f666f5b9e2ee69e2f) by [bashonly](https://github.com/bashonly) diff --git a/README.md b/README.md index 4fb3e450d8..0526fe418a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ [![License: Unlicense](https://img.shields.io/badge/-Unlicense-blue.svg?style=for-the-badge)](LICENSE "License") [![CI Status](https://img.shields.io/github/actions/workflow/status/yt-dlp/yt-dlp/core.yml?branch=master&label=Tests&style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/actions "CI Status") [![Commits](https://img.shields.io/github/commit-activity/m/yt-dlp/yt-dlp?label=commits&style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/commits "Commit History") -[![Last Commit](https://img.shields.io/github/last-commit/yt-dlp/yt-dlp/master?label=&style=for-the-badge&display_timestamp=committer)](https://github.com/yt-dlp/yt-dlp/commits "Commit History") +[![Last Commit](https://img.shields.io/github/last-commit/yt-dlp/yt-dlp/master?label=&style=for-the-badge&display_timestamp=committer)](https://github.com/yt-dlp/yt-dlp/pulse/monthly "Last activity") </div> <!-- MANPAGE: END EXCLUDED SECTION --> @@ -76,7 +76,7 @@ # NEW FEATURES -* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/yt-dlp/yt-dlp/commit/42f2d4) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) +* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API @@ -1323,7 +1323,7 @@ # OUTPUT TEMPLATE - `extractor` (string): Name of the extractor - `extractor_key` (string): Key name of the extractor - `epoch` (numeric): Unix epoch of when the information extraction was completed - - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start` + - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start`, padded with leading zeros to 5 digits - `video_autonumber` (numeric): Number that will be increased with each video - `n_entries` (numeric): Total number of extracted items in the playlist - `playlist_id` (string): Identifier of the playlist that contains the video @@ -1509,7 +1509,7 @@ # FORMAT SELECTION ## Filtering Formats -You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). +You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"` since filters without a selector are interpreted as `best`). 
The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals): @@ -1545,7 +1545,7 @@ ## Filtering Formats **Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. -Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. +Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. Format selectors can also be grouped using parentheses; e.g. `-f "(mp4,webm)[height<480]"` will download the best pre-merged mp4 and webm formats with a height lower than 480. @@ -1805,7 +1805,7 @@ #### youtube * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `formats`: Change the types of formats to return. `dashy` (convert http to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) +* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests @@ -1950,7 +1950,7 @@ # EMBEDDING YT-DLP ydl.download(URLS) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L184). +Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L183) or `help(yt_dlp.YoutubeDL)` in a Python shell. If you are already familiar with the CLI, you can use [`devscripts/cli_to_api.py`](https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py) to translate any CLI switches to `YoutubeDL` params. 
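For example, a minimal sketch of such a translation (the URL and option values here are illustrative, not prescriptive):

```python
import yt_dlp

# Roughly what `yt-dlp -f "bv+ba" --no-playlist URL` corresponds to:
ydl_opts = {
    'format': 'bv+ba',   # -f "bv+ba"
    'noplaylist': True,  # --no-playlist
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```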
**Tip**: If you are porting your code from youtube-dl to yt-dlp, one important point to look out for is that we do not guarantee the return value of `YoutubeDL.extract_info` to be json serializable, or even be a dictionary. It will be dictionary-like, but if you want to ensure it is a serializable dictionary, pass it through `YoutubeDL.sanitize_info` as shown in the [example below](#extracting-information) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index df80f45e0f..f573a74630 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -1,12 +1,12 @@ [ { "action": "add", - "when": "776d1c3f0c9b00399896dd2e40e78e9a43218109", + "when": "29cb20bd563c02671b31dd840139e93dd37150a1", "short": "[priority] **A new release type has been added!**\n * [`nightly`](https://github.com/yt-dlp/yt-dlp/releases/tag/nightly) builds will be made after each push, containing the latest fixes (but also possibly bugs).\n * When using `--update`/`-U`, a release binary will only update to its current channel (either `stable` or `nightly`).\n * The `--update-to` option has been added allowing the user more control over program upgrades (or downgrades).\n * `--update-to` can change the release channel (`stable`, `nightly`) and also upgrade or downgrade to specific tags.\n * **Usage**: `--update-to CHANNEL`, `--update-to TAG`, `--update-to CHANNEL@TAG`" }, { "action": "add", - "when": "776d1c3f0c9b00399896dd2e40e78e9a43218109", + "when": "5038f6d713303e0967d002216e7a88652401c22a", "short": "[priority] **YouTube throttling fixes!**" }, { @@ -38,13 +38,15 @@ }, { "action": "change", - "when": "7b37e8b23691613f331bd4ebc9d639dd6f93c972", - "short": "Improve `--download-sections`\n - Support negative time-ranges\n - Add `*from-url` to obey time-ranges in URL" + "when": "b4e0d75848e9447cee2cd3646ce54d4744a7ff56", + "short": "Improve `--download-sections`\n - Support negative time-ranges\n - Add `*from-url` to obey time-ranges in URL", + "authors": ["pukkandan"] }, { "action": "change", "when": "1e75d97db21152acc764b30a688e516f04b8a142", - "short": "[extractor/youtube] Add `ios` to default clients used\n - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively\n - IOS also has higher bit-rate 'premium' formats though they are not labeled as such" + "short": "[extractor/youtube] Add `ios` to default clients used\n - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively\n - IOS also has higher bit-rate 'premium' formats though they are not labeled as such", + "authors": ["pukkandan"] }, { "action": "change", @@ -55,6 +57,12 @@ { "action": "change", "when": "a4486bfc1dc7057efca9dd3fe70d7fa25c56f700", - "short": "[misc] Revert \"Add automatic duplicate issue detection\"" + "short": "[misc] Revert \"Add automatic duplicate issue detection\"", + "authors": ["pukkandan"] + }, + { + "action": "add", + "when": "1ceb657bdd254ad961489e5060f2ccc7d556b729", + "short": "[priority] Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)\n - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains\n - Cookies are scoped when passed to external downloaders\n - Add `cookie` field to info.json and deprecate `http_headers.Cookie`" } ] From cc0619f62d6da52689797483e96b29290b0c0873 Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Thu, 6 Jul 2023 
18:57:59 +0000 Subject: [PATCH 263/501] Release 2023.07.06 Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +-- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +-- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +-- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +-- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +-- .github/ISSUE_TEMPLATE/6_question.yml | 8 +-- CONTRIBUTORS | 7 +++ Changelog.md | 52 +++++++++++++++++++ supportedsites.md | 6 ++- yt_dlp/version.py | 4 +- 10 files changed, 90 insertions(+), 27 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index a00a11f271..dd1b33dde2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index fc1f41ead5..4f4378924d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) [debug] Python 3.8.10 
(CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index ed51dfa97d..05b4dd23b3 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 1c997f3e27..880f1014c2 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] 
(win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 1638945bf5..acb11795f6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index d27bd57426..a2563e975b 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.06.22** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.06.22 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.06.22, Current version: 2023.06.22 - yt-dlp is up to date (2023.06.22) + Latest version: 2023.07.06, Current version: 2023.07.06 + yt-dlp is up to date (2023.07.06) <more lines> render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 0864f16c4c..6ccd08931d 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -460,3 +460,10 @@ berkanteber OverlordQ rexlambert22 Ti4eeT4e +AmanSal1 +bbilly1 +meliber +nnoboa +rdamas +RfadnjdExt +urectanc diff --git a/Changelog.md b/Changelog.md index d7efa5d259..622ae68b9b 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,58 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.07.06 + +#### Important changes +- Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj) + - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains + - Cookies are scoped when passed to external downloaders + - Add `cookie` field to info.json and deprecate `http_headers.Cookie` + +#### Core changes +- [Allow extractors to mark formats as potentially DRM](https://github.com/yt-dlp/yt-dlp/commit/bc344cd456380999c1ee74554dfd432a38f32ec7) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan) +- [Bugfix for b4e0d75848e9447cee2cd3646ce54d4744a7ff56](https://github.com/yt-dlp/yt-dlp/commit/e59e20744eb32ce4b6ea0dece7c673be8376a710) by [pukkandan](https://github.com/pukkandan) +- [Change how `Cookie` headers are handled](https://github.com/yt-dlp/yt-dlp/commit/3121512228487c9c690d3d39bfd2579addf96e07) by [Grub4K](https://github.com/Grub4K) +- [Prevent `Cookie` leaks on HTTP redirect](https://github.com/yt-dlp/yt-dlp/commit/f8b4bcc0a791274223723488bfbfc23ea3276641) by [coletdjnz](https://github.com/coletdjnz) +- **formats**: [Fix best fallback for storyboards](https://github.com/yt-dlp/yt-dlp/commit/906c0bdcd8974340d619e99ccd613c163eb0d0c2) by [pukkandan](https://github.com/pukkandan) +- **outtmpl**: [Pad `playlist_index` etc even when with internal formatting](https://github.com/yt-dlp/yt-dlp/commit/47bcd437247152e0af5b3ebc5592db7bb66855c2) by [pukkandan](https://github.com/pukkandan) +- **utils**: clean_podcast_url: [Handle protocol in redirect URL](https://github.com/yt-dlp/yt-dlp/commit/91302ed349f34dc26cc1d661bb45a4b71f4417f7) by [pukkandan](https://github.com/pukkandan) + +#### Extractor changes +- **abc**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/8f05fbae2a79ce0713077ccc68b354e63216bf20) ([#7434](https://github.com/yt-dlp/yt-dlp/issues/7434)) by [meliber](https://github.com/meliber) +- 
**AdultSwim**: [Extract subtitles from m3u8](https://github.com/yt-dlp/yt-dlp/commit/5e16cf92eb496b7c1541a6b1d727cb87542984db) ([#7421](https://github.com/yt-dlp/yt-dlp/issues/7421)) by [nnoboa](https://github.com/nnoboa) +- **crunchyroll**: music: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/5b4b92769afcc398475e481bfa839f1158902fe9) ([#7439](https://github.com/yt-dlp/yt-dlp/issues/7439)) by [AmanSal1](https://github.com/AmanSal1), [rdamas](https://github.com/rdamas) +- **Douyin**: [Fix extraction from webpage](https://github.com/yt-dlp/yt-dlp/commit/a2be9781fbf4d7e4db245c277ca2ecc41cf3a7b2) by [bashonly](https://github.com/bashonly) +- **googledrive**: [Fix source format extraction](https://github.com/yt-dlp/yt-dlp/commit/3b7f5300c577fef40464d46d4e4037a69d51fe82) ([#7395](https://github.com/yt-dlp/yt-dlp/issues/7395)) by [RfadnjdExt](https://github.com/RfadnjdExt) +- **kick**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/ef8509c300ea50da86aea447eb214d3d6f6db6bb) by [bashonly](https://github.com/bashonly) +- **qdance**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f0a1ff118145b6449982ba401f9a9f656ecd8062) ([#7420](https://github.com/yt-dlp/yt-dlp/issues/7420)) by [bashonly](https://github.com/bashonly) +- **sbs**: [Python 3.7 compat](https://github.com/yt-dlp/yt-dlp/commit/f393bbe724b1fc6c7f754a5da507e807b2b40ad2) by [pukkandan](https://github.com/pukkandan) +- **stacommu**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/af1fd12f675220df6793fc019dff320bc76e8080) ([#7432](https://github.com/yt-dlp/yt-dlp/issues/7432)) by [urectanc](https://github.com/urectanc) +- **twitter** + - [Fix unauthenticated extraction](https://github.com/yt-dlp/yt-dlp/commit/49296437a8e5fa91dacb5446e51ab588474c85d3) ([#7476](https://github.com/yt-dlp/yt-dlp/issues/7476)) by [bashonly](https://github.com/bashonly) + - spaces: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1cffd621cb371f1563563cfb2fe37d137e8a7bee) ([#7512](https://github.com/yt-dlp/yt-dlp/issues/7512)) by [bashonly](https://github.com/bashonly) +- **vidlii**: [Handle relative URLs](https://github.com/yt-dlp/yt-dlp/commit/ad8902f616ad2541f9b9626738f1393fad89a64c) by [pukkandan](https://github.com/pukkandan) +- **vk**: VKPlay, VKPlayLive: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/8776349ef6b1f644584a92dfa00a05208a48edc4) ([#7358](https://github.com/yt-dlp/yt-dlp/issues/7358)) by [c-basalt](https://github.com/c-basalt) +- **youtube** + - [Add extractor-arg `formats`](https://github.com/yt-dlp/yt-dlp/commit/58786a10f212bd63f9ad1d0b4d9e4d31c3b385e2) by [pukkandan](https://github.com/pukkandan) + - [Avoid false DRM detection](https://github.com/yt-dlp/yt-dlp/commit/94ed638a437fc766699d440e978982e24ce6a30a) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan) + - [Fix comments' `is_favorited`](https://github.com/yt-dlp/yt-dlp/commit/89bed013741a776506f60380b7fd89d27d0710b4) ([#7390](https://github.com/yt-dlp/yt-dlp/issues/7390)) by [bbilly1](https://github.com/bbilly1) + - [Ignore incomplete data for comment threads by default](https://github.com/yt-dlp/yt-dlp/commit/4dc4d8473c085900edc841c87c20041233d25b1f) ([#7475](https://github.com/yt-dlp/yt-dlp/issues/7475)) by [coletdjnz](https://github.com/coletdjnz) + - [Process `post_live` over 2 hours](https://github.com/yt-dlp/yt-dlp/commit/d949c10c45bfc359bdacd52e6a180169b8128958) by [pukkandan](https://github.com/pukkandan) + - stories: 
[Remove](https://github.com/yt-dlp/yt-dlp/commit/90db9a3c00ca80492c6a58c542e4cbf4c2710866) ([#7459](https://github.com/yt-dlp/yt-dlp/issues/7459)) by [pukkandan](https://github.com/pukkandan) + - tab: [Support shorts-only playlists](https://github.com/yt-dlp/yt-dlp/commit/fcbc9ed760be6e3455bbadfaf277b4504b06f068) ([#7425](https://github.com/yt-dlp/yt-dlp/issues/7425)) by [coletdjnz](https://github.com/coletdjnz) + +#### Downloader changes +- **aria2c**: [Add `--no-conf`](https://github.com/yt-dlp/yt-dlp/commit/8a8af356e3bba98a7f7d333aff0777d5d92130c8) by [pukkandan](https://github.com/pukkandan) +- **external**: [Scope cookies](https://github.com/yt-dlp/yt-dlp/commit/1ceb657bdd254ad961489e5060f2ccc7d556b729) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz) +- **http**: [Avoid infinite loop when no data is received](https://github.com/yt-dlp/yt-dlp/commit/662ef1e910b72e57957f06589925b2332ba52821) by [pukkandan](https://github.com/pukkandan) + +#### Misc. changes +- [Add CodeQL workflow](https://github.com/yt-dlp/yt-dlp/commit/6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17) ([#7497](https://github.com/yt-dlp/yt-dlp/issues/7497)) by [pukkandan](https://github.com/pukkandan) +- **cleanup**: Miscellaneous: [337734d](https://github.com/yt-dlp/yt-dlp/commit/337734d4a8a6500bc65434843db346b5cbd05e81) by [pukkandan](https://github.com/pukkandan) +- **docs**: [Minor fixes](https://github.com/yt-dlp/yt-dlp/commit/b532a3481046e1eabb6232ee8196fb696c356ff6) by [pukkandan](https://github.com/pukkandan) +- **make_changelog**: [Skip reverted commits](https://github.com/yt-dlp/yt-dlp/commit/fa44802809d189fca0f4782263d48d6533384503) by [pukkandan](https://github.com/pukkandan) + ### 2023.06.22 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index 7d99d9e227..379d28ef38 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1136,6 +1136,7 @@ # Supported sites - **puhutv:serie** - **Puls4** - **Pyvideo** + - **QDance**: [*qdance*](## "netrc machine") - **QingTing** - **qqmusic**: QQ音乐 - **qqmusic:album**: QQ音乐 - 专辑 @@ -1363,6 +1364,8 @@ # Supported sites - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites + - **StacommuLive**: [*stacommu*](## "netrc machine") + - **StacommuVOD**: [*stacommu*](## "netrc machine") - **StagePlusVODConcert**: [*stageplus*](## "netrc machine") - **stanfordoc**: Stanford Open ClassRoom - **StarTrek** @@ -1647,6 +1650,8 @@ # Supported sites - **vk**: [*vk*](## "netrc machine") VK - **vk:uservideos**: [*vk*](## "netrc machine") VK - User's Videos - **vk:wallpost**: [*vk*](## "netrc machine") + - **VKPlay** + - **VKPlayLive** - **vm.tiktok** - **Vocaroo** - **Vodlocker** @@ -1800,7 +1805,6 @@ # Supported sites - **youtube:​search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix - **youtube:search_url**: YouTube search URLs with sorting and filter support - **youtube:​shorts:pivot:audio**: YouTube Shorts audio pivot (Shorts using audio of a given video) - - **youtube:stories**: YouTube channel stories; "ytstories:" prefix - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) - **youtube:tab**: YouTube Tabs - **youtube:user**: YouTube user videos; "ytuser:" prefix diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 434f36f48f..67cfe44efd 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.06.22' 
+__version__ = '2023.07.06' -RELEASE_GIT_HEAD = '812cdfa06c33a40e73a8e04b3e6f42c084666a43' +RELEASE_GIT_HEAD = 'b532a3481046e1eabb6232ee8196fb696c356ff6' VARIANT = None From b03fa7834579a01cc5fba48c0e73488a16683d48 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 6 Jul 2023 02:00:23 +0530 Subject: [PATCH 264/501] Revert 49296437a8e5fa91dacb5446e51ab588474c85d3 --- README.md | 3 + yt_dlp/extractor/twitter.py | 180 ++++++++++++++++++++++-------------- 2 files changed, 113 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 0526fe418a..655cd41f52 100644 --- a/README.md +++ b/README.md @@ -1852,6 +1852,9 @@ #### tiktok #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` +#### twitter +* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed + #### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index eaf9be5268..1fb9524da6 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,6 +1,5 @@ import json import re -import urllib.error from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE @@ -35,6 +34,7 @@ class TwitterBaseIE(InfoExtractor): _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} + _guest_token = None _flow_token = None _LOGIN_INIT_DATA = json.dumps({ @@ -145,6 +145,14 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) + def _fetch_guest_token(self, headers, display_id): + headers.pop('x-guest-token', None) + self._guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, + 'Downloading guest token', data=b'', headers=headers), 'guest_token') + if not self._guest_token: + raise ExtractorError('Could not retrieve guest token') + def _set_base_headers(self): headers = self._AUTH.copy() csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) @@ -175,15 +183,12 @@ def _perform_login(self, username, password): if self.is_logged_in: return - webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') + self._request_webpage('https://twitter.com/', None, 'Requesting cookies') headers = self._set_base_headers() - guest_token = self._search_regex( - r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._download_json( - f'{self._API_BASE}guest/activate.json', None, 'Downloading guest token', - data=b'', headers=headers)['guest_token'] + self._fetch_guest_token(headers, None) headers.update({ 'content-type': 'application/json', - 'x-guest-token': guest_token, + 'x-guest-token': self._guest_token, 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', 'Referer': 'https://twitter.com/', @@ -280,24 +285,37 @@ def input_dict(subtask_id, text): self.report_login() def _call_api(self, path, video_id, query={}, graphql=False): - if not self.is_logged_in: - 
self.raise_login_required() - - result = self._download_json( - (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, video_id, - f'Downloading {"GraphQL" if graphql else "legacy API"} JSON', headers={ - **self._set_base_headers(), + headers = self._set_base_headers() + if self.is_logged_in: + headers.update({ 'x-twitter-auth-type': 'OAuth2Session', 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', - }, query=query, expected_status={400, 401, 403, 404} if graphql else {403}) + }) - if result.get('errors'): - errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) - raise ExtractorError( - f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) + for first_attempt in (True, False): + if not self.is_logged_in: + if not self._guest_token: + self._fetch_guest_token(headers, video_id) + headers['x-guest-token'] = self._guest_token - return result + allowed_status = {400, 401, 403, 404} if graphql else {403} + result = self._download_json( + (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, + video_id, headers=headers, query=query, expected_status=allowed_status, + note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') + + if result.get('errors'): + errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) + if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): + self.to_screen('Guest token has expired. Refreshing guest token') + self._guest_token = None + continue + + raise ExtractorError( + f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) + + return result def _build_graphql_query(self, media_id): raise NotImplementedError('Method must be implemented to support GraphQL') @@ -439,7 +457,6 @@ class TwitterIE(TwitterBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?' 
_TESTS = [{ - # comment_count, repost_count, view_count are only available with auth (applies to all tests) 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'info_dict': { 'id': '643211870443208704', @@ -454,7 +471,10 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1442188653, 'upload_date': '20150913', 'uploader_url': 'https://twitter.com/freethenipple', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 18, }, @@ -485,6 +505,8 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1447395772, 'upload_date': '20151113', 'uploader_url': 'https://twitter.com/starwars', + 'comment_count': int, + 'repost_count': int, 'like_count': int, 'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'age_limit': 0, @@ -528,7 +550,10 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1455777459, 'upload_date': '20160218', 'uploader_url': 'https://twitter.com/jaydingeer', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, }, @@ -566,7 +591,10 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20160412', 'uploader_url': 'https://twitter.com/CaptainAmerica', 'thumbnail': r're:^https?://.*\.jpg', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -613,7 +641,10 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1505803395, 'upload_date': '20170919', 'uploader_url': 'https://twitter.com/Prefet971', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, }, @@ -636,7 +667,10 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1527623489, 'upload_date': '20180529', 'uploader_url': 'https://twitter.com/LisPower1', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -658,7 +692,10 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1548184644, 'upload_date': '20190122', 'uploader_url': 'https://twitter.com/Twitter', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -676,7 +713,6 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], - 'skip': 'Requires authentication', }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', @@ -693,6 +729,8 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1610651040, 'upload_date': '20210114', 'uploader_url': 'https://twitter.com/BrooklynNets', + 'comment_count': int, + 'repost_count': int, 'like_count': int, 'tags': [], 'age_limit': 0, @@ -715,7 +753,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.03, 'timestamp': 1665025050, + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': [], 'age_limit': 0, }, @@ -724,13 +765,15 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛 | #вʟм - Test', + 'title': 'Ultima | #\u0432\u029f\u043c - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛 | #вʟм', + 'uploader': 'Ultima | #\u0432\u029f\u043c', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', 'timestamp': 1664992565, + 'comment_count': int, + 'repost_count': int, 'like_count': int, 'tags': [], 'age_limit': 0, @@ -752,7 +795,10 @@ class 
TwitterIE(TwitterBaseIE): 'duration': 21.321, 'timestamp': 1664477766, 'upload_date': '20220929', + 'comment_count': int, + 'repost_count': int, 'like_count': int, + 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, }, @@ -779,20 +825,6 @@ class TwitterIE(TwitterBaseIE): }, 'skip': 'Requires authentication', }, { - # Single Vimeo video result without auth - 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', - 'info_dict': { - 'id': '551578322', - 'ext': 'mp4', - 'title': 'Dusty & The Mayor', - 'uploader': 'Michael Chau', - 'uploader_id': 'user29061007', - 'uploader_url': 'https://vimeo.com/user29061007', - 'duration': 478, - 'thumbnail': 'https://i.vimeocdn.com/video/1139658575-0dfdce6e9a2401fe09feb24bf0d14e6f24a53c12f447ff688ace61009ad4c1ba-d_1280', - }, - }, { - # Playlist result only with auth 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -810,7 +842,6 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/Srirachachau', 'timestamp': 1621447860, }, - 'skip': 'Requires authentication', }, { 'url': 'https://twitter.com/DavidToons_/status/1578353380363501568', 'playlist_mincount': 2, @@ -829,7 +860,6 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20221007', 'age_limit': 0, }, - 'skip': 'Requires authentication', }, { 'url': 'https://twitter.com/primevideouk/status/1578401165338976258', 'playlist_count': 2, @@ -843,6 +873,8 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20221007', 'age_limit': 0, 'uploader_url': 'https://twitter.com/primevideouk', + 'comment_count': int, + 'repost_count': int, 'like_count': int, 'tags': ['TheRingsOfPower'], }, @@ -864,7 +896,6 @@ class TwitterIE(TwitterBaseIE): }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, - 'skip': 'Requires authentication', }, { # URL specifies video number but --yes-playlist 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -874,7 +905,9 @@ class TwitterIE(TwitterBaseIE): 'title': 'md5:be05989b0722e114103ed3851a0ffae2', 'timestamp': 1670459604.0, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', + 'comment_count': int, 'uploader_id': 'CTVJLaidlaw', + 'repost_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'upload_date': '20221208', 'age_limit': 0, @@ -893,11 +926,14 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670459604.0, 'uploader_id': 'CTVJLaidlaw', 'uploader': 'Jocelyn Laidlaw', + 'repost_count': int, + 'comment_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'duration': 102.226, 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, + 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, @@ -923,6 +959,9 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 18, 'tags': [], 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'view_count': int, }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -935,7 +974,10 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'uploader_id': 'hlo_again', 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig', + 'repost_count': int, 'duration': 9.531, + 'comment_count': int, + 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, @@ -952,11 +994,14 @@ class TwitterIE(TwitterBaseIE): 'ext': 'mp4', 'uploader_url': 
'https://twitter.com/MunTheShinobi', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', + 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, 'uploader': 'Mün The Shinobi', + 'repost_count': int, 'upload_date': '20221206', 'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'comment_count': int, 'like_count': int, 'tags': [], 'uploader_id': 'MunTheShinobi', @@ -964,14 +1009,14 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670306984.0, }, }, { - # url to retweet id + # url to retweet id, legacy API 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', 'display_id': '1623739803874349067', 'ext': 'mp4', 'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy', - 'description': 'md5:224d62f54b0cdef8e33d4c56c41ac503', + 'description': 'md5:e873616a4a8fe0f93e71872678a672f3', 'uploader': 'Johnny Bullets', 'uploader_id': 'Johnnybull3ts', 'uploader_url': 'https://twitter.com/Johnnybull3ts', @@ -982,7 +1027,10 @@ class TwitterIE(TwitterBaseIE): 'upload_date': '20230208', 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', 'like_count': int, + 'repost_count': int, + 'comment_count': int, }, + 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1033,6 +1081,8 @@ def _graphql_to_legacy(self, data, twid): if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') + if cause and 'adult content' in cause: + self.raise_login_required(cause) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) status = result.get('legacy', {}) @@ -1088,22 +1138,19 @@ def _build_graphql_query(self, media_id): def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') - if not self.is_logged_in: - try: - status = self._download_json( - 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', - headers={'User-Agent': 'Googlebot'}, query={'id': twid}) - self.to_screen(f'Some metadata is missing without authentication. 
{self._login_hint()}') - except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: - self.raise_login_required('Requested tweet may only be available when logged in') - raise + if self._configuration_arg('legacy_api') and not self.is_logged_in: + status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }), 'retweeted_status', None) else: - status = self._graphql_to_legacy( - self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) + status = self._graphql_to_legacy(result, twid) - title = description = traverse_obj( - status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' + title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) user = status.get('user') or {} @@ -1129,16 +1176,12 @@ def _real_extract(self, url): def extract_from_video_info(media): media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) - if not media_id: - # workaround for non-authenticated responses - media_id = traverse_obj(media, ( - 'video_info', 'variants', ..., 'url', - {lambda x: re.search(r'_video/(\d+)/', x)[1]}), get_all=False) self.write_debug(f'Extracting from video info: {media_id}') + video_info = media.get('video_info') or {} formats = [] subtitles = {} - for variant in traverse_obj(media, ('video_info', 'variants', ...)): + for variant in video_info.get('variants', []): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) @@ -1158,12 +1201,12 @@ def add_thumbnail(name, size): add_thumbnail('orig', media.get('original_info') or {}) return { - 'id': media_id or twid, + 'id': media_id, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), - 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), + 'duration': float_or_none(video_info.get('duration_millis'), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), } @@ -1243,15 +1286,12 @@ def get_binding_value(k): } videos = traverse_obj(status, ( - ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), - lambda _, m: m['type'] != 'photo', {dict})) + (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict})) if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card'))) else: - desired_obj = traverse_obj(status, ( - ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), - int(selected_index) - 1, {dict}), get_all=False) + desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict})) if not desired_obj: raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) elif desired_obj.get('type') != 'video': From 92315c03774cfabb3a921884326beb4b981f786b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 6 Jul 2023 14:39:51 -0500 Subject: [PATCH 265/501] 
[extractor/twitter] Fix GraphQL and legacy API (#7516) Authored by: bashonly --- yt_dlp/extractor/twitter.py | 191 +++++++++++++++++++++++------------- 1 file changed, 124 insertions(+), 67 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 1fb9524da6..fc157ac228 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -11,6 +11,7 @@ from ..utils import ( ExtractorError, dict_get, + filter_dict, float_or_none, format_field, int_or_none, @@ -33,8 +34,8 @@ class TwitterBaseIE(InfoExtractor): _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' - _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} - _guest_token = None + _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' + _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE' _flow_token = None _LOGIN_INIT_DATA = json.dumps({ @@ -145,20 +146,21 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - def _fetch_guest_token(self, headers, display_id): - headers.pop('x-guest-token', None) - self._guest_token = traverse_obj(self._download_json( - f'{self._API_BASE}guest/activate.json', display_id, - 'Downloading guest token', data=b'', headers=headers), 'guest_token') - if not self._guest_token: + def _fetch_guest_token(self, display_id): + guest_token = traverse_obj(self._download_json( + f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'', + headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))), + ('guest_token', {str})) + if not guest_token: raise ExtractorError('Could not retrieve guest token') + return guest_token - def _set_base_headers(self): - headers = self._AUTH.copy() - csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) - if csrf_token: - headers['x-csrf-token'] = csrf_token - return headers + def _set_base_headers(self, legacy=False): + bearer_token = self._LEGACY_AUTH if legacy and not self.is_logged_in else self._AUTH + return filter_dict({ + 'Authorization': f'Bearer {bearer_token}', + 'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value), + }) def _call_login_api(self, note, headers, query={}, data=None): response = self._download_json( @@ -183,17 +185,18 @@ def _perform_login(self, username, password): if self.is_logged_in: return - self._request_webpage('https://twitter.com/', None, 'Requesting cookies') - headers = self._set_base_headers() - self._fetch_guest_token(headers, None) - headers.update({ + webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') + guest_token = self._search_regex( + r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None) + headers = { + **self._set_base_headers(), 'content-type': 'application/json', - 'x-guest-token': self._guest_token, + 'x-guest-token': guest_token, 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', 'Referer': 'https://twitter.com/', 'Origin': 'https://twitter.com', - }) + } def build_login_json(*subtask_inputs): 
return json.dumps({ @@ -285,37 +288,26 @@ def input_dict(subtask_id, text): self.report_login() def _call_api(self, path, video_id, query={}, graphql=False): - headers = self._set_base_headers() - if self.is_logged_in: - headers.update({ - 'x-twitter-auth-type': 'OAuth2Session', - 'x-twitter-client-language': 'en', - 'x-twitter-active-user': 'yes', - }) + headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api')) + headers.update({ + 'x-twitter-auth-type': 'OAuth2Session', + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + } if self.is_logged_in else { + 'x-guest-token': self._fetch_guest_token(video_id) + }) + allowed_status = {400, 401, 403, 404} if graphql else {403} + result = self._download_json( + (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, + video_id, headers=headers, query=query, expected_status=allowed_status, + note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') - for first_attempt in (True, False): - if not self.is_logged_in: - if not self._guest_token: - self._fetch_guest_token(headers, video_id) - headers['x-guest-token'] = self._guest_token + if result.get('errors'): + errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) + raise ExtractorError( + f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) - allowed_status = {400, 401, 403, 404} if graphql else {403} - result = self._download_json( - (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, - video_id, headers=headers, query=query, expected_status=allowed_status, - note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') - - if result.get('errors'): - errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) - if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): - self.to_screen('Guest token has expired. 
Refreshing guest token') - self._guest_token = None - continue - - raise ExtractorError( - f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) - - return result + return result def _build_graphql_query(self, media_id): raise NotImplementedError('Method must be implemented to support GraphQL') @@ -765,9 +757,9 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima | #\u0432\u029f\u043c - Test', + 'title': 'Ultima📛 | #вʟм - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima | #\u0432\u029f\u043c', + 'uploader': 'Ultima📛 | #вʟм', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -825,6 +817,7 @@ class TwitterIE(TwitterBaseIE): }, 'skip': 'Requires authentication', }, { + # Playlist result only with auth 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -896,6 +889,7 @@ class TwitterIE(TwitterBaseIE): }, 'add_ie': ['TwitterSpaces'], 'params': {'skip_download': 'm3u8'}, + 'skip': 'Requires authentication', }, { # URL specifies video number but --yes-playlist 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -1009,14 +1003,14 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670306984.0, }, }, { - # url to retweet id, legacy API + # url to retweet id w/ legacy api 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', 'display_id': '1623739803874349067', 'ext': 'mp4', 'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy', - 'description': 'md5:e873616a4a8fe0f93e71872678a672f3', + 'description': 'md5:b06864cd3dc2554821cc327f5348485a', 'uploader': 'Johnny Bullets', 'uploader_id': 'Johnnybull3ts', 'uploader_url': 'https://twitter.com/Johnnybull3ts', @@ -1028,9 +1022,31 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', 'like_count': int, 'repost_count': int, - 'comment_count': int, }, 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, + }, { + # orig tweet w/ graphql + 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', + 'info_dict': { + 'id': '1623274794488659969', + 'display_id': '1623739803874349067', + 'ext': 'mp4', + 'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy', + 'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a', + 'uploader': '@selfisekai@hackerspace.pl 🐀', + 'uploader_id': 'liberdalau', + 'uploader_url': 'https://twitter.com/liberdalau', + 'age_limit': 0, + 'tags': [], + 'duration': 8.033, + 'timestamp': 1675964711.0, + 'upload_date': '20230209', + 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', + 'like_count': int, + 'view_count': int, + 'repost_count': int, + 'comment_count': int, + }, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1073,17 +1089,21 @@ def _graphql_to_legacy(self, data, twid): result = traverse_obj(data, ( 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', - 'tweet_results', 'result', ('tweet', None), - ), expected_type=dict, default={}, get_all=False) + 'tweet_results', 'result', ('tweet', None), 
{dict}, + ), default={}, get_all=False) if self.is_logged_in else traverse_obj( + data, ('tweetResult', 'result', {dict}), default={}) - if result.get('__typename') not in ('Tweet', 'TweetTombstone', None): + if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') - if cause and 'adult content' in cause: - self.raise_login_required(cause) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) + elif result.get('__typename') == 'TweetUnavailable': + reason = result.get('reason') + if reason == 'NsfwLoggedOut': + self.raise_login_required('NSFW tweet requires authentication') + raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) status = result.get('legacy', {}) status.update(traverse_obj(result, { @@ -1134,11 +1154,42 @@ def _build_graphql_query(self, media_id): 'verified_phone_label_enabled': False, 'vibe_api_enabled': True, }, + } if self.is_logged_in else { + 'variables': { + 'tweetId': media_id, + 'withCommunity': False, + 'includePromotedContent': False, + 'withVoice': False, + }, + 'features': { + 'creator_subscriptions_tweet_preview_api_enabled': True, + 'tweetypie_unmention_optimization_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'responsive_web_twitter_article_tweet_consumption_enabled': False, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': True, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True, + 'longform_notetweets_rich_text_read_enabled': True, + 'longform_notetweets_inline_media_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': True, + 'verified_phone_label_enabled': False, + 'responsive_web_media_download_video_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_enhance_cards_enabled': False + }, + 'fieldToggles': { + 'withArticleRichContentState': False + } } def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') - if self._configuration_arg('legacy_api') and not self.is_logged_in: + if not self.is_logged_in and self._configuration_arg('legacy_api'): status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, @@ -1146,11 +1197,15 @@ def _real_extract(self, url): 'include_user_entities': 0, 'tweet_mode': 'extended', }), 'retweeted_status', None) + elif not self.is_logged_in: + status = self._graphql_to_legacy( + self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) else: - result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) - status = self._graphql_to_legacy(result, twid) + status = self._graphql_to_legacy( + self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) - title = description = status['full_text'].replace('\n', ' ') + title = description = traverse_obj( + status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' # strip 
'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) user = status.get('user') or {} @@ -1177,11 +1232,10 @@ def _real_extract(self, url): def extract_from_video_info(media): media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) self.write_debug(f'Extracting from video info: {media_id}') - video_info = media.get('video_info') or {} formats = [] subtitles = {} - for variant in video_info.get('variants', []): + for variant in traverse_obj(media, ('video_info', 'variants', ...)): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) @@ -1206,7 +1260,7 @@ def add_thumbnail(name, size): 'subtitles': subtitles, 'thumbnails': thumbnails, 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), - 'duration': float_or_none(video_info.get('duration_millis'), 1000), + 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), } @@ -1291,7 +1345,8 @@ def get_binding_value(k): if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card'))) else: - desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict})) + desired_obj = traverse_obj(status, ( + (None, 'quoted_status'), 'extended_entities', 'media', int(selected_index) - 1, {dict}), get_all=False) if not desired_obj: raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) elif desired_obj.get('type') != 'video': @@ -1481,6 +1536,8 @@ def _build_graphql_query(self, space_id): def _real_extract(self, url): space_id = self._match_id(url) + if not self.is_logged_in: + self.raise_login_required('Twitter Spaces require authentication') space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] if not space_data: raise ExtractorError('Twitter Space not found', expected=True) From bdd0b75e3f41ff35440eda6d395008beef19ef2f Mon Sep 17 00:00:00 2001 From: GD-Slime <82302542+GD-Slime@users.noreply.github.com> Date: Sun, 9 Jul 2023 06:26:03 +0800 Subject: [PATCH 266/501] [ie/BiliBiliBangumi] Fix extractors (#7337) - Overhaul BiliBiliBangumi extractor for the site's new API - Add BiliBiliBangumiSeason extractor - Refactor BiliBiliBangumiMedia extractor Closes #6701, Closes #7400 Authored by: GD-Slime --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bilibili.py | 129 +++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c0a330dbe5..1e7f165ab9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -214,6 +214,7 @@ from .bilibili import ( BiliBiliIE, BiliBiliBangumiIE, + BiliBiliBangumiSeasonIE, BiliBiliBangumiMediaIE, BiliBiliSearchIE, BilibiliCategoryIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 6629fbc08c..e8714a33ab 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -18,6 +18,7 @@ float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, merge_dicts, mimetype2ext, @@ -135,6 +136,17 @@ def _get_all_children(self, reply): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children 
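# The season-section traversal in the helper added just below assumes the
# pgc endpoint returns JSON roughly shaped as follows (a sketch inferred
# from the traverse_obj path, not an official schema; sample values mirror
# the ep267851 test case):
#
#   season_info = {
#       'result': {
#           'main_section': {
#               'episodes': [
#                   {'id': 267851,
#                    'share_url': 'https://www.bilibili.com/bangumi/play/ep267851'},
#                   # entries without a usable share_url or id are filtered out
#               ],
#           },
#       },
#   }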
+ def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + class BiliBiliIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' @@ -403,76 +415,93 @@ def _real_extract(self, url): class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)' _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { - 'id': 'ss897', + 'id': '267851', 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 'season_id': 897, + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '鬼灭之刃', + 'season_id': '26801', 'season_number': 1, - 'episode': '你与旅行包', - 'episode_number': 2, - 'title': '神的记事本:第2话 你与旅行包', - 'duration': 1428.487, - 'timestamp': 1310809380, - 'upload_date': '20110716', - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'episode': '残酷', + 'episode_id': '267851', + 'episode_number': 1, + 'title': '1 残酷', + 'duration': 1425.256, + 'timestamp': 1554566400, + 'upload_date': '20190406', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, - }, { - 'url': 'https://www.bilibili.com/bangumi/play/ep508406', - 'only_matching': True, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' 
}] def _real_extract(self, url): video_id = self._match_id(url) + episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') - elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage - or '正在观看预览,大会员免费看全片' in webpage): + elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + headers = {'Referer': url, **self.geo_verification_headers()} + play_info = self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, + 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, + headers=headers) + premium_only = play_info.get('code') == -10403 + play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} + formats = self.extract_formats(play_info) - if (not formats and '成为大会员抢先看' in webpage - and play_info.get('durl') and not play_info.get('dash')): + if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + bangumi_info = self._download_json( + 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', + query={'ep_id': episode_id}, headers=headers)['result'] - season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + episode_number, episode_info = next(( + (idx, ep) for idx, ep in enumerate(traverse_obj( + bangumi_info, ('episodes', ..., {dict})), 1) + if str_or_none(ep.get('id')) == episode_id), (1, {})) + + season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( - traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), None) + aid = episode_info.get('aid') + return { 'id': video_id, 'formats': formats, - 'title': traverse_obj(initial_state, 'h1Title'), - 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), - 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), - 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), - 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), - 'season_id': season_id, + **traverse_obj(bangumi_info, { + 'series': ('series', 'series_title', {str}), + 'series_id': ('series', 'series_id', {str_or_none}), + 'thumbnail': ('square_cover', {url_or_none}), + }), + 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), + 'episode': episode_info.get('long_title'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_info.get('title')) or episode_number, + 'season_id': str_or_none(season_id), 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, + 'subtitles': 
self.extract_subtitles(video_id, aid, episode_info.get('cid')), + '__post_extractor': self.extract_comments(aid), + 'http_headers': headers, } -class BiliBiliBangumiMediaIE(InfoExtractor): +class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', @@ -485,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) + ss_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801' + }, + 'playlist_mincount': 26 + }] + + def _real_extract(self, url): + ss_id = self._match_id(url) + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): From 325191d0c9bf3fe257b8a7c2eb95080f44f6ddfc Mon Sep 17 00:00:00 2001 From: Zprokkel <105783800+Zprokkel@users.noreply.github.com> Date: Mon, 10 Jul 2023 15:15:47 +0200 Subject: [PATCH 267/501] [ie/vrt] Update token signing key (#7519) Authored by: Zprokkel --- yt_dlp/extractor/vrt.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index bacd3df29a..0058357122 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -44,9 +44,11 @@ class VRTBaseIE(GigyaBaseIE): 'version': '2.7.4-prod-2023-04-19T06:05:45' } } - # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.fd1de01a40a1e3d842ea.js + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' - _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' + _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev + # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595 + # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae def _extract_formats_and_subtitles(self, data, video_id): if traverse_obj(data, 'drm'): From 2af4eeb77246b8183aae75a0a8d19f18c08115b2 Mon Sep 17 00:00:00 2001 From: Mahmoud Abdel-Fattah <accounts@abdel-fattah.net> Date: Tue, 11 Jul 2023 05:00:38 +0400 Subject: [PATCH 268/501] [utils] `clean_podcast_url`: Handle more trackers (#7556) Authored by: mabdelfattah, bashonly Closes #7544 --- test/test_utils.py | 2 ++ yt_dlp/utils/_utils.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a22f25d730..bdbd2d8796 100644 
--- a/test/test_utils.py +++ b/test/test_utils.py @@ -1835,6 +1835,8 @@ def test_iri_to_uri(self): def test_clean_podcast_url(self): self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + self.assertEqual(clean_podcast_url('https://pdst.fm/e/2.gum.fm/chtbl.com/track/chrt.fm/track/34D33/pscrb.fm/rss/p/traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661'), 'https://traffic.megaphone.fm/ITLLC7765286967.mp3?updated=1687282661') + self.assertEqual(clean_podcast_url('https://pdst.fm/e/https://mgln.ai/e/441/www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3'), 'https://www.buzzsprout.com/1121972/13019085-ep-252-the-deep-life-stack.mp3') def test_LazyList(self): it = list(range(10)) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 82d9ba4d57..3023c33b24 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5123,14 +5123,18 @@ def clean_podcast_url(url): (?: chtbl\.com/track| media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ - play\.podtrac\.com - )/[^/]+| + play\.podtrac\.com| + chrt\.fm/track| + mgln\.ai/e + )(?:/[^/.]+)?| (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure flex\.acast\.com| pd(?: cn\.co| # https://podcorn.com/analytics-prefix/ st\.fm # https://podsights.com/docs/ - )/e + )/e| + [0-9]\.gum\.fm| + pscrb\.fm/rss/p )/''', '', url) return re.sub(r'^\w+://(\w+://)', r'\1', url) From 2cfe221fbbe46faa3f46552c08d947a51f424903 Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Thu, 13 Jul 2023 20:17:05 +0600 Subject: [PATCH 269/501] [ie/streamanity] Remove (#7571) Service is dead Authored by: alerikaisattera --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/streamanity.py | 47 --------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 yt_dlp/extractor/streamanity.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1e7f165ab9..2af99b3dad 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1871,7 +1871,6 @@ StoryFireSeriesIE, ) from .streamable import StreamableIE -from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streamff import StreamFFIE diff --git a/yt_dlp/extractor/streamanity.py b/yt_dlp/extractor/streamanity.py deleted file mode 100644 index 6eaee52d95..0000000000 --- a/yt_dlp/extractor/streamanity.py +++ /dev/null @@ -1,47 +0,0 @@ -from .common import InfoExtractor - - -class StreamanityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamanity\.com/video/(?P<id>[A-Za-z0-9]+)' - _TESTS = [{ - 'url': 'https://streamanity.com/video/9DFPTnuYi8f2', - 'md5': '6ab171e8d4a02ad5dcbff6bea44cf5a1', - 'info_dict': { - 'id': '9DFPTnuYi8f2', - 'ext': 'mp4', - 'title': 'Bitcoin vs The Lighting Network', - 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', - 'description': '', - 
'uploader': 'Tom Bombadil (Freddy78)', - } - }, { - 'url': 'https://streamanity.com/video/JktOUjSlfzTD', - 'md5': '31f131e28abd3377c38be586a59532dc', - 'info_dict': { - 'id': 'JktOUjSlfzTD', - 'ext': 'mp4', - 'title': 'Share data when you see it', - 'thumbnail': r're:https://res\.cloudinary\.com/.+\.png', - 'description': 'Reposting as data should be public and stored on blockchain', - 'uploader': 'digitalcurrencydaily', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json( - f'https://app.streamanity.com/api/video/{video_id}', video_id)['data']['video'] - - formats = self._extract_m3u8_formats( - f'https://stream.mux.com/{video_info["play_id"]}.m3u8?token={video_info["token"]}', - video_id, ext='mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': video_info['title'], - 'description': video_info.get('description'), - 'uploader': video_info.get('author_name'), - 'is_live': False, - 'thumbnail': video_info.get('thumb'), - 'formats': formats, - } From 8a4cd12c8f8e93292e3e95200b9d17a3af39624c Mon Sep 17 00:00:00 2001 From: Neurognostic <donovan@tremura.email> Date: Thu, 13 Jul 2023 16:39:21 -0400 Subject: [PATCH 270/501] [pp/EmbedThumbnail] Support `m4v` (#7583) Authored by: Neurognostic --- yt_dlp/postprocessor/embedthumbnail.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 88a767132a..d7be0b398e 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -114,7 +114,7 @@ def run(self, info): self._report_run('ffmpeg', filename) self.run_ffmpeg(filename, temp_filename, options) - elif info['ext'] in ['m4a', 'mp4', 'mov']: + elif info['ext'] in ['m4a', 'mp4', 'm4v', 'mov']: prefer_atomicparsley = 'embed-thumbnail-atomicparsley' in self.get_param('compat_opts', []) # Method 1: Use mutagen if not mutagen or prefer_atomicparsley: @@ -213,7 +213,7 @@ def run(self, info): temp_filename = filename else: - raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/mov') + raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus/flac, m4a/mp4/m4v/mov') if success and temp_filename != filename: os.replace(temp_filename, filename) From 1bcb9fe8715b1f288efc322be3de409ee0597080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Finn=20R=2E=20G=C3=A4rtner?= <65015656+FinnRG@users.noreply.github.com> Date: Fri, 14 Jul 2023 20:09:02 +0200 Subject: [PATCH 271/501] [ie/piapro] Support `/content` URL (#7592) Authored by: FinnRG --- yt_dlp/extractor/piapro.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py index d8d9c78010..eb5923d110 100644 --- a/yt_dlp/extractor/piapro.py +++ b/yt_dlp/extractor/piapro.py @@ -12,17 +12,22 @@ class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>\w+)/?' 
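    # A quick sanity check of the broadened pattern above (a sketch assuming
    # only the stdlib re module; both sample URLs come from the tests below):
    #
    #   import re
    #   pattern = r'https?://piapro\.jp/(?:t|content)/(?P<id>\w+)/?'
    #   for url in ('https://piapro.jp/t/NXYR',
    #               'https://piapro.jp/content/hcw0z3a169wtemz6'):
    #       assert re.match(pattern, url).group('id')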
_TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', - 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77', + 'md5': 'f7c0f760913fb1d44a1c45a4af793909', 'info_dict': { 'id': 'NXYR', 'ext': 'mp3', 'uploader': 'wowaka', 'uploader_id': 'wowaka', 'title': '裏表ラバーズ', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'http://www.nicovideo.jp/watch/sm8082467', + 'duration': 189.0, + 'timestamp': 1251785475, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'upload_date': '20090901', + 'view_count': int, } }, { 'note': 'There are break lines in description, mandating (?s) flag', @@ -34,8 +39,16 @@ class PiaproIE(InfoExtractor): 'title': '青に溶けた風船 / 初音ミク', 'description': 'md5:d395a9bd151447631a5a1460bc7f9132', 'uploader': 'シアン・キノ', + 'duration': 229.0, + 'timestamp': 1644030039, + 'upload_date': '20220205', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'uploader_id': 'cyankino', } + }, { + 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', + 'only_matching': True }] _login_status = False From 1ba6fe9db5f660d5538588315c23ad6cf0371c5f Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 15 Jul 2023 15:20:24 +1200 Subject: [PATCH 272/501] [ie/youtube:tab] Detect looping feeds (#6621) Closes https://github.com/yt-dlp/yt-dlp/issues/5555 Note: the first page may still be repeated, however this is better than nothing. Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 73bfa662d2..826bbb20e1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4921,10 +4921,15 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) yield from extract_entries(parent_renderer) continuation = continuation_list[0] - + seen_continuations = set() for page_num in itertools.count(1): if not continuation: break + continuation_token = continuation.get('continuation') + if continuation_token is not None and continuation_token in seen_continuations: + self.write_debug('Detected YouTube feed looping - assuming end of feed.') + break + seen_continuations.add(continuation_token) headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data) response = self._extract_response( From 1b392f905d20ef1f1b300b180f867d43c9ce49b8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 15 Jul 2023 11:41:08 +0530 Subject: [PATCH 273/501] [utils] Add temporary shim for logging Related: #5680, #7517 --- test/test_downloader_http.py | 12 +----------- yt_dlp/cookies.py | 23 ++++------------------- yt_dlp/utils/_utils.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 381b2583cd..099ec2fff4 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -16,6 +16,7 @@ from yt_dlp import YoutubeDL from yt_dlp.downloader.http import HttpFD from yt_dlp.utils import encodeFilename +from yt_dlp.utils._utils import _YDLLogger as FakeLogger TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -67,17 +68,6 @@ def do_GET(self): assert False -class FakeLogger: - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHttpFD(unittest.TestCase): def setUp(self): self.httpd = http.server.HTTPServer( diff --git 
a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 53fe0ec2d3..16f1918e6a 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -41,30 +41,15 @@ try_call, write_string, ) +from .utils._utils import _YDLLogger CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} -class YDLLogger: - def __init__(self, ydl=None): - self._ydl = ydl - - def debug(self, message): - if self._ydl: - self._ydl.write_debug(message) - - def info(self, message): - if self._ydl: - self._ydl.to_screen(f'[Cookies] {message}') - - def warning(self, message, only_once=False): - if self._ydl: - self._ydl.report_warning(message, only_once) - - def error(self, message): - if self._ydl: - self._ydl.report_error(message) +class YDLLogger(_YDLLogger): + def warning(self, message, only_once=False): # compat + return super().warning(message, once=only_once) class ProgressBar(MultilinePrinter): _DELAY, _timer = 0.1, 0 diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 3023c33b24..4af955743d 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5994,3 +5994,33 @@ def calculate_preference(self, format): format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None return tuple(self._calculate_field_preference(format, field) for field in self._order) + + +# XXX: Temporary +class _YDLLogger: + def __init__(self, ydl=None): + self._ydl = ydl + + def debug(self, message): + if self._ydl: + self._ydl.write_debug(message) + + def info(self, message): + if self._ydl: + self._ydl.to_screen(message) + + def warning(self, message, *, once=False): + if self._ydl: + self._ydl.report_warning(message, only_once=once) + + def error(self, message, *, is_error=True): + if self._ydl: + self._ydl.report_error(message, is_error=is_error) + + def stdout(self, message): + if self._ydl: + self._ydl.to_stdout(message) + + def stderr(self, message): + if self._ydl: + self._ydl.to_stderr(message) From c365dba8430ee33abda85d31f95128605bf240eb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 15 Jul 2023 14:30:08 +0530 Subject: [PATCH 274/501] [networking] Add module (#2861) No actual changes - code is only moved around --- Makefile | 2 +- devscripts/make_changelog.py | 1 + test/{test_http.py => test_networking.py} | 0 test/test_utils.py | 18 +- yt_dlp/YoutubeDL.py | 40 +- yt_dlp/networking/__init__.py | 0 yt_dlp/networking/_helper.py | 139 +++++++ yt_dlp/networking/_urllib.py | 315 ++++++++++++++ yt_dlp/networking/exceptions.py | 9 + yt_dlp/utils/__init__.py | 5 +- yt_dlp/utils/_deprecated.py | 19 + yt_dlp/utils/_utils.py | 479 +--------------------- yt_dlp/utils/networking.py | 60 +++ 13 files changed, 587 insertions(+), 500 deletions(-) rename test/{test_http.py => test_networking.py} (100%) create mode 100644 yt_dlp/networking/__init__.py create mode 100644 yt_dlp/networking/_helper.py create mode 100644 yt_dlp/networking/_urllib.py create mode 100644 yt_dlp/networking/exceptions.py create mode 100644 yt_dlp/utils/networking.py diff --git a/Makefile b/Makefile index b1ac0e7d68..c85b24c13e 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ offlinetest: codetest $(PYTHON) -m pytest -k "not download" # XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib yt_dlp/utils yt_dlp/dependencies +CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib 
yt_dlp/utils yt_dlp/dependencies yt_dlp/networking yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip for d in $(CODE_FOLDERS) ; do \ diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 3ad4c5408b..157c661267 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -54,6 +54,7 @@ def commit_lookup(cls): 'core', 'dependencies', 'jsinterp', + 'networking', 'outtmpl', 'formats', 'plugins', diff --git a/test/test_http.py b/test/test_networking.py similarity index 100% rename from test/test_http.py rename to test/test_networking.py diff --git a/test/test_utils.py b/test/test_utils.py index bdbd2d8796..862c7d0f75 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -258,15 +258,6 @@ def test_sanitize_url(self): self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('foo bar'), 'foo bar') - def test_extract_basic_auth(self): - auth_header = lambda url: sanitized_Request(url).get_header('Authorization') - self.assertFalse(auth_header('http://foo.bar')) - self.assertFalse(auth_header('http://:foo.bar')) - self.assertEqual(auth_header('http://@foo.bar'), 'Basic Og==') - self.assertEqual(auth_header('http://:pass@foo.bar'), 'Basic OnBhc3M=') - self.assertEqual(auth_header('http://user:@foo.bar'), 'Basic dXNlcjo=') - self.assertEqual(auth_header('http://user:pass@foo.bar'), 'Basic dXNlcjpwYXNz') - def test_expand_path(self): def env(var): return f'%{var}%' if sys.platform == 'win32' else f'${var}' @@ -2324,6 +2315,15 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'], msg='function on a `re.Match` should give group name as well') + def test_extract_basic_auth(self): + auth_header = lambda url: sanitized_Request(url).get_header('Authorization') + self.assertFalse(auth_header('http://foo.bar')) + self.assertFalse(auth_header('http://:foo.bar')) + self.assertEqual(auth_header('http://@foo.bar'), 'Basic Og==') + self.assertEqual(auth_header('http://:pass@foo.bar'), 'Basic OnBhc3M=') + self.assertEqual(auth_header('http://user:@foo.bar'), 'Basic dXNlcjo=') + self.assertEqual(auth_header('http://user:pass@foo.bar'), 'Basic dXNlcjpwYXNz') + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7f55716669..138646ebfc 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -151,6 +151,7 @@ write_json_file, write_string, ) +from .utils.networking import clean_headers from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__ if compat_os_name == 'nt': @@ -672,6 +673,7 @@ def process_color_policy(stream): raise self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -745,9 +747,6 @@ def check_deprecated(param, option, suggestion): else self.params['format'] if callable(self.params['format']) else self.build_format_selector(self.params['format'])) - # Set http_headers defaults according to std_headers - self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) - hooks = { 'post_hooks': self.add_post_hook, 'progress_hooks': self.add_progress_hook, @@ -941,12 +940,14 @@ def __enter__(self): self.save_console_title() return self - def __exit__(self, *args): - self.restore_console_title() - + def save_cookies(self): if self.params.get('cookiefile') is not None: 
self.cookiejar.save(ignore_discard=True, ignore_expires=True) + def __exit__(self, *args): + self.restore_console_title() + self.save_cookies() + def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. @@ -2468,9 +2469,7 @@ def restore_last_token(self): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - if 'Youtubedl-No-Compression' in res: # deprecated - res.pop('Youtubedl-No-Compression', None) - res['Accept-Encoding'] = 'identity' + clean_headers(res) cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: encoder = LenientSimpleCookie() @@ -3856,12 +3855,6 @@ def list_thumbnails(self, info_dict): def list_subtitles(self, video_id, subtitles, name='subtitles'): self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles) - def urlopen(self, req): - """ Start an HTTP download """ - if isinstance(req, str): - req = sanitized_Request(req) - return self._opener.open(req, timeout=self._socket_timeout) - def print_debug_header(self): if not self.params.get('verbose'): return @@ -3989,13 +3982,8 @@ def _setup_opener(self): return timeout_val = self.params.get('socket_timeout') self._socket_timeout = 20 if timeout_val is None else float(timeout_val) - - opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser') - opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') - self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self) - cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': @@ -4037,6 +4025,18 @@ def file_open(*args, **kwargs): opener.addheaders = [] self._opener = opener + @functools.cached_property + def cookiejar(self): + """Global cookiejar instance""" + return load_cookies( + self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + + def urlopen(self, req): + """ Start an HTTP download """ + if isinstance(req, str): + req = sanitized_Request(req) + return self._opener.open(req, timeout=self._socket_timeout) + def encode(self, s): if isinstance(s, bytes): return s # Already encoded diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py new file mode 100644 index 0000000000..367f3f4447 --- /dev/null +++ b/yt_dlp/networking/_helper.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import contextlib +import ssl +import sys +import urllib.parse + +from ..dependencies import certifi +from ..socks import ProxyType +from ..utils import YoutubeDLError + + +def ssl_load_certs(context: ssl.SSLContext, use_certifi=True): + if certifi and use_certifi: + context.load_verify_locations(cafile=certifi.where()) + else: + try: + context.load_default_certs() + # Work around the issue in load_default_certs when there are bad certificates. See: + # https://github.com/yt-dlp/yt-dlp/issues/1060, + # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 + except ssl.SSLError: + # enum_certificates is not present in mingw python. 
See https://github.com/yt-dlp/yt-dlp/issues/1151 + if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): + for storename in ('CA', 'ROOT'): + _ssl_load_windows_store_certs(context, storename) + context.set_default_verify_paths() + + +def _ssl_load_windows_store_certs(ssl_context, storename): + # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py + try: + certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) + if encoding == 'x509_asn' and ( + trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] + except PermissionError: + return + for cert in certs: + with contextlib.suppress(ssl.SSLError): + ssl_context.load_verify_locations(cadata=cert) + + +def make_socks_proxy_opts(socks_proxy): + url_components = urllib.parse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + def unquote_if_non_empty(s): + if not s: + return s + return urllib.parse.unquote_plus(s) + return { + 'proxytype': socks_type, + 'addr': url_components.hostname, + 'port': url_components.port or 1080, + 'rdns': True, + 'username': unquote_if_non_empty(url_components.username), + 'password': unquote_if_non_empty(url_components.password), + } + + +def get_redirect_method(method, status): + """Unified redirect method handling""" + + # A 303 must either use GET or HEAD for subsequent request + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 + if status == 303 and method != 'HEAD': + method = 'GET' + # 301 and 302 redirects are commonly turned into a GET from a POST + # for subsequent requests by browsers, so we'll do the same. + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 + if status in (301, 302) and method == 'POST': + method = 'GET' + return method + + +def make_ssl_context( + verify=True, + client_certificate=None, + client_certificate_key=None, + client_certificate_password=None, + legacy_support=False, + use_certifi=True, +): + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname = verify + context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE + + # Some servers may reject requests if ALPN extension is not sent. See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + with contextlib.suppress(NotImplementedError): + context.set_alpn_protocols(['http/1.1']) + if verify: + ssl_load_certs(context, use_certifi) + + if legacy_support: + context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT + context.set_ciphers('DEFAULT') # compat + + elif ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) and not ssl.OPENSSL_VERSION.startswith('LibreSSL'): + # Use the default SSL ciphers and minimum TLS version settings from Python 3.10 [1]. + # This is to ensure consistent behavior across Python versions and libraries, and help avoid fingerprinting + # in some situations [2][3]. + # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely + # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe. + # LibreSSL is excluded until further investigation due to cipher support issues [5][6]. + # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536 + # 2. 
https://github.com/yt-dlp/yt-dlp/issues/4627
+        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
+        # 4. https://peps.python.org/pep-0644/
+        # 5. https://peps.python.org/pep-0644/#libressl-support
+        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
+        context.set_ciphers(
+            '@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
+        context.minimum_version = ssl.TLSVersion.TLSv1_2
+
+    if client_certificate:
+        try:
+            context.load_cert_chain(
+                client_certificate, keyfile=client_certificate_key,
+                password=client_certificate_password)
+        except ssl.SSLError:
+            raise YoutubeDLError('Unable to load client certificate')
+
+    return context
+
+
+def add_accept_encoding_header(headers, supported_encodings):
+    if supported_encodings and 'Accept-Encoding' not in headers:
+        headers['Accept-Encoding'] = ', '.join(supported_encodings)
+
+    elif 'Accept-Encoding' not in headers:
+        headers['Accept-Encoding'] = 'identity'
diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py
new file mode 100644
index 0000000000..1f5871ae67
--- /dev/null
+++ b/yt_dlp/networking/_urllib.py
@@ -0,0 +1,315 @@
+import functools
+import gzip
+import http.client
+import io
+import socket
+import ssl
+import urllib.error
+import urllib.parse
+import urllib.request
+import urllib.response
+import zlib
+
+from ._helper import (
+    add_accept_encoding_header,
+    get_redirect_method,
+    make_socks_proxy_opts,
+)
+from ..dependencies import brotli
+from ..socks import sockssocket
+from ..utils import escape_url, update_url_query
+from ..utils.networking import clean_headers, std_headers
+
+SUPPORTED_ENCODINGS = ['gzip', 'deflate']
+
+if brotli:
+    SUPPORTED_ENCODINGS.append('br')
+
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    hc = http_class(*args, **kwargs)
+    source_address = ydl_handler._params.get('source_address')
+
+    if source_address is not None:
+        # This is to work around _create_connection() from socket where it will try all
+        # address data from getaddrinfo() including IPv6. This filters the result from
+        # getaddrinfo() based on the source_address value.
+        # This is based on the cpython socket.create_connection() function.
+        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
+        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+            host, port = address
+            err = None
+            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+            ip_addrs = [addr for addr in addrs if addr[0] == af]
+            if addrs and not ip_addrs:
+                ip_version = 'v4' if af == socket.AF_INET else 'v6'
+                raise OSError(
+                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
+                    % (ip_version, source_address[0]))
+            for res in ip_addrs:
+                af, socktype, proto, canonname, sa = res
+                sock = None
+                try:
+                    sock = socket.socket(af, socktype, proto)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+                        sock.settimeout(timeout)
+                    sock.bind(source_address)
+                    sock.connect(sa)
+                    err = None  # Explicitly break reference cycle
+                    return sock
+                except OSError as _:
+                    err = _
+                    if sock is not None:
+                        sock.close()
+            if err is not None:
+                raise err
+            else:
+                raise OSError('getaddrinfo returns an empty list')
+        if hasattr(hc, '_create_connection'):
+            hc._create_connection = _create_connection
+        hc.source_address = (source_address, 0)
+
+    return hc
+
+
+class HTTPHandler(urllib.request.HTTPHandler):
+    """Handler for HTTP requests and responses.
+
+    This class, when installed with an OpenerDirector, automatically adds
+    the standard headers to every HTTP request and handles gzipped, deflated and
+    brotli responses from web servers.
+
+    Part of this code was copied from:
+
+    http://techknack.net/python-urllib2-handlers/
+
+    Andrew Rowls, the author of that code, agreed to release it to the
+    public domain.
+    """
+
+    def __init__(self, params, *args, **kwargs):
+        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
+        self._params = params
+
+    def http_open(self, req):
+        conn_class = http.client.HTTPConnection
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
+        return self.do_open(functools.partial(
+            _create_http_connection, self, conn_class, False),
+            req)
+
+    @staticmethod
+    def deflate(data):
+        if not data:
+            return data
+        try:
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+        except zlib.error:
+            return zlib.decompress(data)
+
+    @staticmethod
+    def brotli(data):
+        if not data:
+            return data
+        return brotli.decompress(data)
+
+    @staticmethod
+    def gz(data):
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk at the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+
+    def http_request(self, req):
+        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
+        # always respected by websites; some tend to give out URLs with non percent-encoded
+        # non-ASCII characters (see telemb.py, ard.py [#3412])
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # To work around the aforementioned issue we will replace the request's original URL with
+        # a percent-encoded one
+        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+        # the code of this workaround has been moved here from YoutubeDL.urlopen()
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute URL if it changed after escaping
+        if url != url_escaped:
+            req = update_Request(req, url=url_escaped)
+
+        for h, v in self._params.get('http_headers', std_headers).items():
+            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
+            # urllib capitalizes the dict keys because of this bug
+            if h.capitalize() not in req.headers:
+                req.add_header(h, v)
+
+        clean_headers(req.headers)
+        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
+        return super().do_request_(req)
+
+    def http_response(self, req, resp):
+        old_resp = resp
+
+        # The Content-Encoding header lists the encodings in the order they were applied [1].
+        # To decompress, we simply do the reverse.
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
+        decoded_response = None
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+            if encoding == 'gzip':
+                decoded_response = self.gz(decoded_response or resp.read())
+            elif encoding == 'deflate':
+                decoded_response = self.deflate(decoded_response or resp.read())
+            elif encoding == 'br' and brotli:
+                decoded_response = self.brotli(decoded_response or resp.read())
+
+        if decoded_response is not None:
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6457).
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
+                location = location.encode('iso-8859-1').decode()
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    resp.headers['Location'] = location_escaped
+        return resp
+
+    https_request = http_request
+    https_response = http_response
+
+
+def make_socks_conn_class(base_class, socks_proxy):
+    assert issubclass(base_class, (
+        http.client.HTTPConnection, http.client.HTTPSConnection))
+
+    proxy_args = make_socks_proxy_opts(socks_proxy)
+
+    class SocksConnection(base_class):
+        def connect(self):
+            self.sock = sockssocket()
+            self.sock.setproxy(**proxy_args)
+            if isinstance(self.timeout, (int, float)):
+                self.sock.settimeout(self.timeout)
+            self.sock.connect((self.host, self.port))
+
+            if isinstance(self, http.client.HTTPSConnection):
+                if hasattr(self, '_context'):  # Python > 2.6
+                    self.sock = self._context.wrap_socket(
+                        self.sock, server_hostname=self.host)
+                else:
+                    self.sock = ssl.wrap_socket(self.sock)
+
+    return SocksConnection
+
+
+class RedirectHandler(urllib.request.HTTPRedirectHandler):
+    """YoutubeDL redirect handler
+
+    The code is based on the HTTPRedirectHandler implementation from CPython [1].
+
+    This redirect handler fixes and improves the logic to better align with RFC7231
+    and what browsers tend to do [2][3]
+
+    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
+    2. https://datatracker.ietf.org/doc/html/rfc7231
+    3. https://github.com/python/cpython/issues/91306
+    """
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        if code not in (301, 302, 303, 307, 308):
+            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
+
+        new_data = req.data
+
+        # Technically the Cookie header should be in unredirected_hdrs,
+        # however in practice some may set it in normal headers anyway.
+        # We will remove it here to prevent any leaks.
+        remove_headers = ['Cookie']
+
+        new_method = get_redirect_method(req.get_method(), code)
+        # only remove payload if method changed (e.g. POST to GET)
+        if new_method != req.get_method():
+            new_data = None
+            remove_headers.extend(['Content-Length', 'Content-Type'])
+
+        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
+
+        return urllib.request.Request(
+            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+            unverifiable=True, method=new_method, data=new_data)
+
+
+class ProxyHandler(urllib.request.ProxyHandler):
+    def __init__(self, proxies=None):
+        # Set default handlers
+        for type in ('http', 'https'):
+            setattr(self, '%s_open' % type,
+                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
+                        meth(r, proxy, type))
+        urllib.request.ProxyHandler.__init__(self, proxies)
+
+    def proxy_open(self, req, proxy, type):
+        req_proxy = req.headers.get('Ytdl-request-proxy')
+        if req_proxy is not None:
+            proxy = req_proxy
+            del req.headers['Ytdl-request-proxy']
+
+        if proxy == '__noproxy__':
+            return None  # No Proxy
+        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+            req.add_header('Ytdl-socks-proxy', proxy)
+            # yt-dlp's http/https handlers wrap the socket with socks
+            return None
+        return urllib.request.ProxyHandler.proxy_open(
+            self, req, proxy, type)
+
+
+class PUTRequest(urllib.request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
+class HEADRequest(urllib.request.Request):
+    def get_method(self):
+        return 'HEAD'
+
+
+def update_Request(req, url=None, data=None, headers=None, query=None):
+    req_headers = req.headers.copy()
+    req_headers.update(headers or {})
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = urllib.request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req
diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py
new file mode 100644
index 0000000000..89b484a220
--- /dev/null
+++ b/yt_dlp/networking/exceptions.py
@@ -0,0 +1,9 @@
+import http.client
+import socket
+import ssl
+import urllib.error
+
+network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
+if hasattr(ssl, 'CertificateError'):
+    network_exceptions.append(ssl.CertificateError)
+network_exceptions = tuple(network_exceptions)
diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py
index 2dd20ada25..0b00adddb4 100644
--- a/yt_dlp/utils/__init__.py
+++ b/yt_dlp/utils/__init__.py
@@ -3,13 +3,10 @@
 
 from ..compat.compat_utils import passthrough_module
 
-# XXX: Implement this the same way as other DeprecationWarnings
without circular import -passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( - DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +passthrough_module(__name__, '._deprecated') del passthrough_module # isort: off from .traversal import * from ._utils import * from ._utils import _configuration_args, _get_exe_version_output -from ._deprecated import * diff --git a/yt_dlp/utils/_deprecated.py b/yt_dlp/utils/_deprecated.py index 4454d84a72..ca0fb1614d 100644 --- a/yt_dlp/utils/_deprecated.py +++ b/yt_dlp/utils/_deprecated.py @@ -1,7 +1,26 @@ """Deprecated - New code should avoid these""" +import warnings + +from ..compat.compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) +del passthrough_module + from ._utils import preferredencoding +# isort: split +from ..networking._urllib import PUTRequest # noqa: F401 +from ..networking._urllib import SUPPORTED_ENCODINGS, HEADRequest # noqa: F401 +from ..networking._urllib import HTTPHandler as YoutubeDLHandler # noqa: F401 +from ..networking._urllib import ProxyHandler as PerRequestProxyHandler # noqa: F401 +from ..networking._urllib import RedirectHandler as YoutubeDLRedirectHandler # noqa: F401 +from ..networking._urllib import make_socks_conn_class, update_Request # noqa: F401 +from ..networking.exceptions import network_exceptions # noqa: F401 +from .networking import random_user_agent, std_headers # noqa: F401 + def encodeFilename(s, for_subprocess=False): assert isinstance(s, str) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 4af955743d..d5704cadca 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -11,7 +11,6 @@ import email.header import email.utils import errno -import gzip import hashlib import hmac import html.entities @@ -46,7 +45,6 @@ import urllib.parse import urllib.request import xml.etree.ElementTree -import zlib from . 
import traversal @@ -58,8 +56,7 @@ compat_os_name, compat_shlex_quote, ) -from ..dependencies import brotli, certifi, websockets, xattr -from ..socks import ProxyType, sockssocket +from ..dependencies import websockets, xattr __name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module @@ -67,65 +64,6 @@ compiled_regex_type = type(re.compile('')) -def random_user_agent(): - _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' - _CHROME_VERSIONS = ( - '90.0.4430.212', - '90.0.4430.24', - '90.0.4430.70', - '90.0.4430.72', - '90.0.4430.85', - '90.0.4430.93', - '91.0.4472.101', - '91.0.4472.106', - '91.0.4472.114', - '91.0.4472.124', - '91.0.4472.164', - '91.0.4472.19', - '91.0.4472.77', - '92.0.4515.107', - '92.0.4515.115', - '92.0.4515.131', - '92.0.4515.159', - '92.0.4515.43', - '93.0.4556.0', - '93.0.4577.15', - '93.0.4577.63', - '93.0.4577.82', - '94.0.4606.41', - '94.0.4606.54', - '94.0.4606.61', - '94.0.4606.71', - '94.0.4606.81', - '94.0.4606.85', - '95.0.4638.17', - '95.0.4638.50', - '95.0.4638.54', - '95.0.4638.69', - '95.0.4638.74', - '96.0.4664.18', - '96.0.4664.45', - '96.0.4664.55', - '96.0.4664.93', - '97.0.4692.20', - ) - return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) - - -SUPPORTED_ENCODINGS = [ - 'gzip', 'deflate' -] -if brotli: - SUPPORTED_ENCODINGS.append('br') - -std_headers = { - 'User-Agent': random_user_agent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-us,en;q=0.5', - 'Sec-Fetch-Mode': 'navigate', -} - - USER_AGENTS = { 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', } @@ -958,80 +896,16 @@ def formatSeconds(secs, delim=':', msec=False): return '%s.%03d' % (ret, time.milliseconds) if msec else ret -def _ssl_load_windows_store_certs(ssl_context, storename): - # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py - try: - certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) - if encoding == 'x509_asn' and ( - trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] - except PermissionError: - return - for cert in certs: - with contextlib.suppress(ssl.SSLError): - ssl_context.load_verify_locations(cadata=cert) - - def make_HTTPS_handler(params, **kwargs): - opts_check_certificate = not params.get('nocheckcertificate') - context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - context.check_hostname = opts_check_certificate - if params.get('legacyserverconnect'): - context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT - # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998 - context.set_ciphers('DEFAULT') - elif ( - sys.version_info < (3, 10) - and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) - and not ssl.OPENSSL_VERSION.startswith('LibreSSL') - ): - # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1]. - # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting - # in some situations [2][3]. - # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely - # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe. - # LibreSSL is excluded until further investigation due to cipher support issues [5][6]. - # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536 - # 2. https://github.com/yt-dlp/yt-dlp/issues/4627 - # 3. 
https://github.com/yt-dlp/yt-dlp/pull/5294 - # 4. https://peps.python.org/pep-0644/ - # 5. https://peps.python.org/pep-0644/#libressl-support - # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368 - context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM') - context.minimum_version = ssl.TLSVersion.TLSv1_2 - - context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE - if opts_check_certificate: - if certifi and 'no-certifi' not in params.get('compat_opts', []): - context.load_verify_locations(cafile=certifi.where()) - else: - try: - context.load_default_certs() - # Work around the issue in load_default_certs when there are bad certificates. See: - # https://github.com/yt-dlp/yt-dlp/issues/1060, - # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 - except ssl.SSLError: - # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 - if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): - for storename in ('CA', 'ROOT'): - _ssl_load_windows_store_certs(context, storename) - context.set_default_verify_paths() - - client_certfile = params.get('client_certificate') - if client_certfile: - try: - context.load_cert_chain( - client_certfile, keyfile=params.get('client_certificate_key'), - password=params.get('client_certificate_password')) - except ssl.SSLError: - raise YoutubeDLError('Unable to load client certificate') - - # Some servers may reject requests if ALPN extension is not sent. See: - # https://github.com/python/cpython/issues/85140 - # https://github.com/yt-dlp/yt-dlp/issues/3878 - with contextlib.suppress(NotImplementedError): - context.set_alpn_protocols(['http/1.1']) - - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + from ..networking._helper import make_ssl_context + return YoutubeDLHTTPSHandler(params, context=make_ssl_context( + verify=not params.get('nocheckcertificate'), + client_certificate=params.get('client_certificate'), + client_certificate_key=params.get('client_certificate_key'), + client_certificate_password=params.get('client_certificate_password'), + legacy_support=params.get('legacyserverconnect'), + use_certifi='no-certifi' not in params.get('compat_opts', []), + ), **kwargs) def bug_reports_message(before=';'): @@ -1059,12 +933,6 @@ def __init__(self, msg=None): super().__init__(self.msg) -network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] -if hasattr(ssl, 'CertificateError'): - network_exceptions.append(ssl.CertificateError) -network_exceptions = tuple(network_exceptions) - - class ExtractorError(YoutubeDLError): """Error during info extraction.""" @@ -1072,6 +940,7 @@ def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=N """ tb, if given, is the original traceback (so that it can be printed out). If expected is set, this is a normal error message and most likely not a bug in yt-dlp. 
""" + from ..networking.exceptions import network_exceptions if sys.exc_info()[0] in network_exceptions: expected = True @@ -1271,225 +1140,6 @@ class XAttrUnavailableError(YoutubeDLError): pass -def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): - hc = http_class(*args, **kwargs) - source_address = ydl_handler._params.get('source_address') - - if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise OSError( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except OSError as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise OSError('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection - hc.source_address = (source_address, 0) - - return hc - - -class YoutubeDLHandler(urllib.request.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped, deflated and - brotli responses from web servers. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. 
- """ - - def __init__(self, params, *args, **kwargs): - urllib.request.HTTPHandler.__init__(self, *args, **kwargs) - self._params = params - - def http_open(self, req): - conn_class = http.client.HTTPConnection - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, False), - req) - - @staticmethod - def deflate(data): - if not data: - return data - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - @staticmethod - def brotli(data): - if not data: - return data - return brotli.decompress(data) - - @staticmethod - def gz(data): - gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') - try: - return gz.read() - except OSError as original_oserror: - # There may be junk add the end of the file - # See http://stackoverflow.com/q/4928560/35070 for details - for i in range(1, 1024): - try: - gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb') - return gz.read() - except OSError: - continue - else: - raise original_oserror - - def http_request(self, req): - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) - # the code of this workaround has been moved here from YoutubeDL.urlopen() - url = req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - req = update_Request(req, url=url_escaped) - - for h, v in self._params.get('http_headers', std_headers).items(): - # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 - # The dict keys are capitalized because of this bug by urllib - if h.capitalize() not in req.headers: - req.add_header(h, v) - - if 'Youtubedl-no-compression' in req.headers: # deprecated - req.headers.pop('Youtubedl-no-compression', None) - req.add_header('Accept-encoding', 'identity') - - if 'Accept-encoding' not in req.headers: - req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS)) - - return super().do_request_(req) - - def http_response(self, req, resp): - old_resp = resp - - # Content-Encoding header lists the encodings in order that they were applied [1]. - # To decompress, we simply do the reverse. 
- # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding - decoded_response = None - for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))): - if encoding == 'gzip': - decoded_response = self.gz(decoded_response or resp.read()) - elif encoding == 'deflate': - decoded_response = self.deflate(decoded_response or resp.read()) - elif encoding == 'br' and brotli: - decoded_response = self.brotli(decoded_response or resp.read()) - - if decoded_response is not None: - resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see - # https://github.com/ytdl-org/youtube-dl/issues/6457). - if 300 <= resp.code < 400: - location = resp.headers.get('Location') - if location: - # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 - location = location.encode('iso-8859-1').decode() - location_escaped = escape_url(location) - if location != location_escaped: - del resp.headers['Location'] - resp.headers['Location'] = location_escaped - return resp - - https_request = http_request - https_response = http_response - - -def make_socks_conn_class(base_class, socks_proxy): - assert issubclass(base_class, ( - http.client.HTTPConnection, http.client.HTTPSConnection)) - - url_components = urllib.parse.urlparse(socks_proxy) - if url_components.scheme.lower() == 'socks5': - socks_type = ProxyType.SOCKS5 - elif url_components.scheme.lower() in ('socks', 'socks4'): - socks_type = ProxyType.SOCKS4 - elif url_components.scheme.lower() == 'socks4a': - socks_type = ProxyType.SOCKS4A - - def unquote_if_non_empty(s): - if not s: - return s - return urllib.parse.unquote_plus(s) - - proxy_args = ( - socks_type, - url_components.hostname, url_components.port or 1080, - True, # Remote DNS - unquote_if_non_empty(url_components.username), - unquote_if_non_empty(url_components.password), - ) - - class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(*proxy_args) - if isinstance(self.timeout, (int, float)): - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) - - if isinstance(self, http.client.HTTPSConnection): - if hasattr(self, '_context'): # Python > 2.6 - self.sock = self._context.wrap_socket( - self.sock, server_hostname=self.host) - else: - self.sock = ssl.wrap_socket(self.sock) - - return SocksConnection - - class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler): def __init__(self, params, https_conn_class=None, *args, **kwargs): urllib.request.HTTPSHandler.__init__(self, *args, **kwargs) @@ -1507,9 +1157,11 @@ def https_open(self, req): socks_proxy = req.headers.get('Ytdl-socks-proxy') if socks_proxy: + from ..networking._urllib import make_socks_conn_class conn_class = make_socks_conn_class(conn_class, socks_proxy) del req.headers['Ytdl-socks-proxy'] + from ..networking._urllib import _create_http_connection try: return self.do_open( functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs) @@ -1535,56 +1187,6 @@ def http_response(self, request, response): https_response = http_response -class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler): - """YoutubeDL redirect handler - - The code is based on HTTPRedirectHandler implementation from CPython [1]. 
- - This redirect handler fixes and improves the logic to better align with RFC7261 - and what browsers tend to do [2][3] - - 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py - 2. https://datatracker.ietf.org/doc/html/rfc7231 - 3. https://github.com/python/cpython/issues/91306 - """ - - http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 - - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code not in (301, 302, 303, 307, 308): - raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) - - new_method = req.get_method() - new_data = req.data - - # Technically the Cookie header should be in unredirected_hdrs, - # however in practice some may set it in normal headers anyway. - # We will remove it here to prevent any leaks. - remove_headers = ['Cookie'] - - # A 303 must either use GET or HEAD for subsequent request - # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 - if code == 303 and req.get_method() != 'HEAD': - new_method = 'GET' - # 301 and 302 redirects are commonly turned into a GET from a POST - # for subsequent requests by browsers, so we'll do the same. - # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 - # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 - elif code in (301, 302) and req.get_method() == 'POST': - new_method = 'GET' - - # only remove payload if method changed (e.g. POST to GET) - if new_method != req.get_method(): - new_data = None - remove_headers.extend(['Content-Length', 'Content-Type']) - - new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers} - - return urllib.request.Request( - newurl, headers=new_headers, origin_req_host=req.origin_req_host, - unverifiable=True, method=new_method, data=new_data) - - def extract_timezone(date_str): m = re.search( r'''(?x) @@ -2390,16 +1992,6 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -class HEADRequest(urllib.request.Request): - def get_method(self): - return 'HEAD' - - -class PUTRequest(urllib.request.Request): - def get_method(self): - return 'PUT' - - def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr and v is not None: v = getattr(v, get_attr, None) @@ -3016,26 +2608,6 @@ def update_url_query(url, query): return update_url(url, query_update=query) -def update_Request(req, url=None, data=None, headers=None, query=None): - req_headers = req.headers.copy() - req_headers.update(headers or {}) - req_data = data or req.data - req_url = update_url_query(url or req.get_full_url(), query) - req_get_method = req.get_method() - if req_get_method == 'HEAD': - req_type = HEADRequest - elif req_get_method == 'PUT': - req_type = PUTRequest - else: - req_type = urllib.request.Request - new_req = req_type( - req_url, data=req_data, headers=req_headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - if hasattr(req, 'timeout'): - new_req.timeout = req.timeout - return new_req - - def _multipart_encode_impl(data, boundary): content_type = 'multipart/form-data; boundary=%s' % boundary @@ -4769,31 +4341,6 @@ def random_ipv4(cls, code_or_block): struct.pack('!L', random.randint(addr_min, addr_max)))) -class PerRequestProxyHandler(urllib.request.ProxyHandler): - def __init__(self, proxies=None): - # Set default handlers - for type in ('http', 'https'): - setattr(self, '%s_open' % type, - lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: - meth(r, proxy, type)) - 
urllib.request.ProxyHandler.__init__(self, proxies)
-
-    def proxy_open(self, req, proxy, type):
-        req_proxy = req.headers.get('Ytdl-request-proxy')
-        if req_proxy is not None:
-            proxy = req_proxy
-            del req.headers['Ytdl-request-proxy']
-
-        if proxy == '__noproxy__':
-            return None  # No Proxy
-        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
-            req.add_header('Ytdl-socks-proxy', proxy)
-            # yt-dlp's http/https handlers do wrapping the socket with socks
-            return None
-        return urllib.request.ProxyHandler.proxy_open(
-            self, req, proxy, type)
-
-
 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
 # released into Public Domain
 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py
new file mode 100644
index 0000000000..95b54fabef
--- /dev/null
+++ b/yt_dlp/utils/networking.py
@@ -0,0 +1,60 @@
+import random
+
+
+def random_user_agent():
+    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
+    _CHROME_VERSIONS = (
+        '90.0.4430.212',
+        '90.0.4430.24',
+        '90.0.4430.70',
+        '90.0.4430.72',
+        '90.0.4430.85',
+        '90.0.4430.93',
+        '91.0.4472.101',
+        '91.0.4472.106',
+        '91.0.4472.114',
+        '91.0.4472.124',
+        '91.0.4472.164',
+        '91.0.4472.19',
+        '91.0.4472.77',
+        '92.0.4515.107',
+        '92.0.4515.115',
+        '92.0.4515.131',
+        '92.0.4515.159',
+        '92.0.4515.43',
+        '93.0.4556.0',
+        '93.0.4577.15',
+        '93.0.4577.63',
+        '93.0.4577.82',
+        '94.0.4606.41',
+        '94.0.4606.54',
+        '94.0.4606.61',
+        '94.0.4606.71',
+        '94.0.4606.81',
+        '94.0.4606.85',
+        '95.0.4638.17',
+        '95.0.4638.50',
+        '95.0.4638.54',
+        '95.0.4638.69',
+        '95.0.4638.74',
+        '96.0.4664.18',
+        '96.0.4664.45',
+        '96.0.4664.55',
+        '96.0.4664.93',
+        '97.0.4692.20',
+    )
+    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
+
+
+std_headers = {
+    'User-Agent': random_user_agent(),
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-us,en;q=0.5',
+    'Sec-Fetch-Mode': 'navigate',
+}
+
+
+def clean_headers(headers):
+    if 'Youtubedl-no-compression' in headers:  # compat
+        del headers['Youtubedl-no-compression']
+        headers['Accept-Encoding'] = 'identity'

From 227bf1a33be7b89cd7d44ad046844c4ccba104f4 Mon Sep 17 00:00:00 2001
From: coletdjnz <coletdjnz@protonmail.com>
Date: Sat, 15 Jul 2023 15:55:23 +0530
Subject: [PATCH 275/501] [networking] Rewrite architecture (#2861)

The new networking interface consists of a `RequestDirector` that directs
each `Request` to the appropriate `RequestHandler` and returns the `Response`
or raises `RequestError`. The handlers define adapters that transform their
internal Request/Response/Error types into our interfaces.
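To illustrate that dispatch, here is a minimal sketch (an illustration only,
not the actual implementation: `SketchDirector` is a hypothetical name, while
`validate`, `send`, `UnsupportedRequest` and `RequestError` mirror the
interfaces exercised by the tests below):

    # Sketch only: route a Request to the first handler that supports it.
    from yt_dlp.networking.exceptions import RequestError, UnsupportedRequest

    class SketchDirector:
        def __init__(self):
            self.handlers = []  # RequestHandler instances, in preference order

        def send(self, request):
            for handler in self.handlers:
                try:
                    # A handler validates scheme, proxy and extension support,
                    # raising UnsupportedRequest if it cannot serve the request.
                    handler.validate(request)
                except UnsupportedRequest:
                    continue
                # The handler adapts the Request to its backend, performs the
                # transfer, and adapts the result back into a Response;
                # failures surface as RequestError subclasses.
                return handler.send(request)
            raise RequestError('no request handler supports this request')
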
User-facing changes: - Fix issues with per request proxies on redirects for urllib - Support for `ALL_PROXY` environment variable for proxy setting - Support for `socks5h` proxy - Closes https://github.com/yt-dlp/yt-dlp/issues/6325, https://github.com/ytdl-org/youtube-dl/issues/22618, https://github.com/ytdl-org/youtube-dl/pull/28093 - Raise error when using `https` proxy instead of silently converting it to `http` Authored by: coletdjnz --- test/test_download.py | 9 +- test/test_networking.py | 1351 +++++++++++++++++++++++++------ test/test_networking_utils.py | 239 ++++++ test/test_utils.py | 46 +- yt_dlp/YoutubeDL.py | 175 ++-- yt_dlp/compat/__init__.py | 10 + yt_dlp/downloader/http.py | 24 +- yt_dlp/extractor/common.py | 32 +- yt_dlp/networking/__init__.py | 13 + yt_dlp/networking/_helper.py | 91 ++- yt_dlp/networking/_urllib.py | 231 +++++- yt_dlp/networking/common.py | 522 ++++++++++++ yt_dlp/networking/exceptions.py | 202 ++++- yt_dlp/utils/_deprecated.py | 13 +- yt_dlp/utils/_utils.py | 35 +- yt_dlp/utils/networking.py | 67 +- 16 files changed, 2586 insertions(+), 474 deletions(-) create mode 100644 test/test_networking_utils.py create mode 100644 yt_dlp/networking/common.py diff --git a/test/test_download.py b/test/test_download.py index 43b39c36b3..fd7752cddf 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -10,10 +10,7 @@ import collections import hashlib -import http.client import json -import socket -import urllib.error from test.helper import ( assertGreaterEqual, @@ -29,6 +26,7 @@ import yt_dlp.YoutubeDL # isort: split from yt_dlp.extractor import get_info_extractor +from yt_dlp.networking.exceptions import HTTPError, TransportError from yt_dlp.utils import ( DownloadError, ExtractorError, @@ -162,8 +160,7 @@ def try_rm_tcs_files(tcs=None): force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) - or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)): + if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].code == 503): err.msg = f'{getattr(err, "msg", err)} ({tname})' raise @@ -249,7 +246,7 @@ def try_rm_tcs_files(tcs=None): # extractor returns full results even with extract_flat res_tcs = [{'info_dict': e} for e in res_dict['entries']] try_rm_tcs_files(res_tcs) - + ydl.close() return test_template diff --git a/test/test_networking.py b/test/test_networking.py index e4e66dce18..147a4ff491 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -3,32 +3,74 @@ # Allow direct execution import os import sys -import unittest + +import pytest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import functools import gzip +import http.client import http.cookiejar import http.server +import inspect import io import pathlib +import random import ssl import tempfile import threading +import time import urllib.error import urllib.request +import warnings import zlib +from email.message import Message +from http.cookiejar import CookieJar -from test.helper import http_server_port -from yt_dlp import YoutubeDL +from test.helper import FakeYDL, http_server_port from yt_dlp.dependencies import brotli -from yt_dlp.utils import sanitized_Request, urlencode_postdata - -from .helper import 
FakeYDL +from yt_dlp.networking import ( + HEADRequest, + PUTRequest, + Request, + RequestDirector, + RequestHandler, + Response, +) +from yt_dlp.networking._urllib import UrllibRH +from yt_dlp.networking.common import _REQUEST_HANDLERS +from yt_dlp.networking.exceptions import ( + CertificateVerifyError, + HTTPError, + IncompleteRead, + NoSupportingHandlers, + RequestError, + SSLError, + TransportError, + UnsupportedRequest, +) +from yt_dlp.utils._utils import _YDLLogger as FakeLogger +from yt_dlp.utils.networking import HTTPHeaderDict TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +def _build_proxy_handler(name): + class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): + proxy_name = name + + def log_message(self, format, *args): + pass + + def do_GET(self): + self.send_response(200) + self.send_header('Content-Type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode()) + return HTTPTestRequestHandler + + class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' @@ -36,7 +78,7 @@ def log_message(self, format, *args): pass def _headers(self): - payload = str(self.headers).encode('utf-8') + payload = str(self.headers).encode() self.send_response(200) self.send_header('Content-Type', 'application/json') self.send_header('Content-Length', str(len(payload))) @@ -70,7 +112,7 @@ def _read_data(self): return self.rfile.read(int(self.headers['Content-Length'])) def do_POST(self): - data = self._read_data() + data = self._read_data() + str(self.headers).encode() if self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): @@ -89,7 +131,7 @@ def do_HEAD(self): self._status(404) def do_PUT(self): - data = self._read_data() + data = self._read_data() + str(self.headers).encode() if self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): @@ -102,7 +144,7 @@ def do_GET(self): payload = b'<html><video src="/vid.mp4" /></html>' self.send_response(200) self.send_header('Content-Type', 'text/html; charset=utf-8') - self.send_header('Content-Length', str(len(payload))) # required for persistent connections + self.send_header('Content-Length', str(len(payload))) self.end_headers() self.wfile.write(payload) elif self.path == '/vid.mp4': @@ -126,10 +168,15 @@ def do_GET(self): self.send_header('Content-Length', str(len(payload))) self.end_headers() self.wfile.write(payload) + elif self.path.startswith('/redirect_loop'): + self.send_response(301) + self.send_header('Location', self.path) + self.send_header('Content-Length', '0') + self.end_headers() elif self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): - self._method('GET') + self._method('GET', str(self.headers).encode()) elif self.path.startswith('/headers'): self._headers() elif self.path.startswith('/308-to-headers'): @@ -179,7 +226,32 @@ def do_GET(self): self.send_header('Content-Length', str(len(payload))) self.end_headers() self.wfile.write(payload) - + elif self.path.startswith('/gen_'): + payload = b'<html></html>' + self.send_response(int(self.path[len('/gen_'):])) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + elif self.path.startswith('/incompleteread'): + payload = b'<html></html>' + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + 
self.send_header('Content-Length', '234234') + self.end_headers() + self.wfile.write(payload) + self.finish() + elif self.path.startswith('/timeout_'): + time.sleep(int(self.path[len('/timeout_'):])) + self._headers() + elif self.path == '/source_address': + payload = str(self.client_address[0]).encode() + self.send_response(200) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload) + self.finish() else: self._status(404) @@ -198,334 +270,1099 @@ def send_header(self, keyword, value): self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode()) -class FakeLogger: - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass +def validate_and_send(rh, req): + rh.validate(req) + return rh.send(req) -class TestHTTP(unittest.TestCase): - def setUp(self): - # HTTP server - self.http_httpd = http.server.ThreadingHTTPServer( +class TestRequestHandlerBase: + @classmethod + def setup_class(cls): + cls.http_httpd = http.server.ThreadingHTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) - self.http_port = http_server_port(self.http_httpd) - self.http_server_thread = threading.Thread(target=self.http_httpd.serve_forever) + cls.http_port = http_server_port(cls.http_httpd) + cls.http_server_thread = threading.Thread(target=cls.http_httpd.serve_forever) # FIXME: we should probably stop the http server thread after each test # See: https://github.com/yt-dlp/yt-dlp/pull/7094#discussion_r1199746041 - self.http_server_thread.daemon = True - self.http_server_thread.start() + cls.http_server_thread.daemon = True + cls.http_server_thread.start() # HTTPS server certfn = os.path.join(TEST_DIR, 'testcert.pem') - self.https_httpd = http.server.ThreadingHTTPServer( + cls.https_httpd = http.server.ThreadingHTTPServer( ('127.0.0.1', 0), HTTPTestRequestHandler) sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.load_cert_chain(certfn, None) - self.https_httpd.socket = sslctx.wrap_socket(self.https_httpd.socket, server_side=True) - self.https_port = http_server_port(self.https_httpd) - self.https_server_thread = threading.Thread(target=self.https_httpd.serve_forever) - self.https_server_thread.daemon = True - self.https_server_thread.start() + cls.https_httpd.socket = sslctx.wrap_socket(cls.https_httpd.socket, server_side=True) + cls.https_port = http_server_port(cls.https_httpd) + cls.https_server_thread = threading.Thread(target=cls.https_httpd.serve_forever) + cls.https_server_thread.daemon = True + cls.https_server_thread.start() - def test_nocheckcertificate(self): - with FakeYDL({'logger': FakeLogger()}) as ydl: - with self.assertRaises(urllib.error.URLError): - ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers')) - with FakeYDL({'logger': FakeLogger(), 'nocheckcertificate': True}) as ydl: - r = ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers')) - self.assertEqual(r.status, 200) +@pytest.fixture +def handler(request): + RH_KEY = request.param + if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler): + handler = RH_KEY + elif RH_KEY in _REQUEST_HANDLERS: + handler = _REQUEST_HANDLERS[RH_KEY] + else: + pytest.skip(f'{RH_KEY} request handler is not available') + + return functools.partial(handler, logger=FakeLogger) + + +class TestHTTPRequestHandler(TestRequestHandlerBase): + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_verify_cert(self, handler): + with handler() 
as rh: + with pytest.raises(CertificateVerifyError): + validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers')) + + with handler(verify=False) as rh: + r = validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers')) + assert r.status == 200 r.close() - def test_percent_encode(self): - with FakeYDL() as ydl: + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_ssl_error(self, handler): + # HTTPS server with too old TLS version + # XXX: is there a better way to test this than to create a new server? + https_httpd = http.server.ThreadingHTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + https_httpd.socket = sslctx.wrap_socket(https_httpd.socket, server_side=True) + https_port = http_server_port(https_httpd) + https_server_thread = threading.Thread(target=https_httpd.serve_forever) + https_server_thread.daemon = True + https_server_thread.start() + + with handler(verify=False) as rh: + with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: + validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) + assert not issubclass(exc_info.type, CertificateVerifyError) + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_percent_encode(self, handler): + with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding - res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/中文.html')) - self.assertEqual(res.status, 200) + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/中文.html')) + assert res.status == 200 res.close() # don't normalize existing percent encodings - res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/%c7%9f')) - self.assertEqual(res.status, 200) + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/%c7%9f')) + assert res.status == 200 res.close() - def test_unicode_path_redirection(self): - with FakeYDL() as ydl: - r = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) - self.assertEqual(r.url, f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html') + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_unicode_path_redirection(self, handler): + with handler() as rh: + r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) + assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - def test_redirect(self): - with FakeYDL() as ydl: - def do_req(redirect_status, method): + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_raise_http_error(self, handler): + with handler() as rh: + for bad_status in (400, 500, 599, 302): + with pytest.raises(HTTPError): + validate_and_send(rh, Request('http://127.0.0.1:%d/gen_%d' % (self.http_port, bad_status))) + + # Should not raise an error + validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_response_url(self, handler): + with handler() as rh: + # Response url should be that of the last url in redirect chain + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_301')) + assert res.url == f'http://127.0.0.1:{self.http_port}/method' + res.close() + res2 = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_200')) + assert res2.url == 
f'http://127.0.0.1:{self.http_port}/gen_200' + res2.close() + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_redirect(self, handler): + with handler() as rh: + def do_req(redirect_status, method, assert_no_content=False): data = b'testdata' if method in ('POST', 'PUT') else None - res = ydl.urlopen(sanitized_Request( - f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data)) - return res.read().decode('utf-8'), res.headers.get('method', '') + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data)) + + headers = b'' + data_sent = b'' + if data is not None: + data_sent += res.read(len(data)) + if data_sent != data: + headers += data_sent + data_sent = b'' + + headers += res.read() + + if assert_no_content or data is None: + assert b'Content-Type' not in headers + assert b'Content-Length' not in headers + else: + assert b'Content-Type' in headers + assert b'Content-Length' in headers + + return data_sent.decode(), res.headers.get('method', '') # A 303 must either use GET or HEAD for subsequent request - self.assertEqual(do_req(303, 'POST'), ('', 'GET')) - self.assertEqual(do_req(303, 'HEAD'), ('', 'HEAD')) + assert do_req(303, 'POST', True) == ('', 'GET') + assert do_req(303, 'HEAD') == ('', 'HEAD') - self.assertEqual(do_req(303, 'PUT'), ('', 'GET')) + assert do_req(303, 'PUT', True) == ('', 'GET') # 301 and 302 turn POST only into a GET - # XXX: we should also test if the Content-Type and Content-Length headers are removed - self.assertEqual(do_req(301, 'POST'), ('', 'GET')) - self.assertEqual(do_req(301, 'HEAD'), ('', 'HEAD')) - self.assertEqual(do_req(302, 'POST'), ('', 'GET')) - self.assertEqual(do_req(302, 'HEAD'), ('', 'HEAD')) + assert do_req(301, 'POST', True) == ('', 'GET') + assert do_req(301, 'HEAD') == ('', 'HEAD') + assert do_req(302, 'POST', True) == ('', 'GET') + assert do_req(302, 'HEAD') == ('', 'HEAD') - self.assertEqual(do_req(301, 'PUT'), ('testdata', 'PUT')) - self.assertEqual(do_req(302, 'PUT'), ('testdata', 'PUT')) + assert do_req(301, 'PUT') == ('testdata', 'PUT') + assert do_req(302, 'PUT') == ('testdata', 'PUT') # 307 and 308 should not change method for m in ('POST', 'PUT'): - self.assertEqual(do_req(307, m), ('testdata', m)) - self.assertEqual(do_req(308, m), ('testdata', m)) + assert do_req(307, m) == ('testdata', m) + assert do_req(308, m) == ('testdata', m) - self.assertEqual(do_req(307, 'HEAD'), ('', 'HEAD')) - self.assertEqual(do_req(308, 'HEAD'), ('', 'HEAD')) + assert do_req(307, 'HEAD') == ('', 'HEAD') + assert do_req(308, 'HEAD') == ('', 'HEAD') # These should not redirect and instead raise an HTTPError for code in (300, 304, 305, 306): - with self.assertRaises(urllib.error.HTTPError): + with pytest.raises(HTTPError): do_req(code, 'GET') - def test_content_type(self): - # https://github.com/yt-dlp/yt-dlp/commit/379a4f161d4ad3e40932dcf5aca6e6fb9715ab28 - with FakeYDL({'nocheckcertificate': True}) as ydl: - # method should be auto-detected as POST - r = sanitized_Request(f'https://localhost:{self.https_port}/headers', data=urlencode_postdata({'test': 'test'})) - - headers = ydl.urlopen(r).read().decode('utf-8') - self.assertIn('Content-Type: application/x-www-form-urlencoded', headers) - - # test http - r = sanitized_Request(f'http://localhost:{self.http_port}/headers', data=urlencode_postdata({'test': 'test'})) - headers = ydl.urlopen(r).read().decode('utf-8') - self.assertIn('Content-Type: 
application/x-www-form-urlencoded', headers) - - def test_cookiejar(self): - with FakeYDL() as ydl: - ydl.cookiejar.set_cookie(http.cookiejar.Cookie( - 0, 'test', 'ytdlp', None, False, '127.0.0.1', True, - False, '/headers', True, False, None, False, None, None, {})) - data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers')).read() - self.assertIn(b'Cookie: test=ytdlp', data) - - def test_passed_cookie_header(self): + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. - with FakeYDL() as ydl: + with handler() as rh: # Specified Cookie header should be used - res = ydl.urlopen( - sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers', - headers={'Cookie': 'test=test'})).read().decode('utf-8') - self.assertIn('Cookie: test=test', res) + res = validate_and_send( + rh, Request( + f'http://127.0.0.1:{self.http_port}/headers', + headers={'Cookie': 'test=test'})).read().decode() + assert 'Cookie: test=test' in res # Specified Cookie header should be removed on any redirect - res = ydl.urlopen( - sanitized_Request(f'http://127.0.0.1:{self.http_port}/308-to-headers', headers={'Cookie': 'test=test'})).read().decode('utf-8') - self.assertNotIn('Cookie: test=test', res) + res = validate_and_send( + rh, Request( + f'http://127.0.0.1:{self.http_port}/308-to-headers', + headers={'Cookie': 'test=test'})).read().decode() + assert 'Cookie: test=test' not in res - # Specified Cookie header should override global cookiejar for that request - ydl.cookiejar.set_cookie(http.cookiejar.Cookie( - version=0, name='test', value='ytdlp', port=None, port_specified=False, - domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/', - path_specified=True, secure=False, expires=None, discard=False, comment=None, - comment_url=None, rest={})) + # Specified Cookie header should override global cookiejar for that request + cookiejar = http.cookiejar.CookieJar() + cookiejar.set_cookie(http.cookiejar.Cookie( + version=0, name='test', value='ytdlp', port=None, port_specified=False, + domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/', + path_specified=True, secure=False, expires=None, discard=False, comment=None, + comment_url=None, rest={})) - data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'Cookie': 'test=test'})).read() - self.assertNotIn(b'Cookie: test=ytdlp', data) - self.assertIn(b'Cookie: test=test', data) + with handler(cookiejar=cookiejar) as rh: + data = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', headers={'cookie': 'test=test'})).read() + assert b'Cookie: test=ytdlp' not in data + assert b'Cookie: test=test' in data - def test_no_compression_compat_header(self): - with FakeYDL() as ydl: - data = ydl.urlopen( - sanitized_Request( - f'http://127.0.0.1:{self.http_port}/headers', - headers={'Youtubedl-no-compression': True})).read() - self.assertIn(b'Accept-Encoding: identity', data) - self.assertNotIn(b'youtubedl-no-compression', data.lower()) + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_redirect_loop(self, handler): + with handler() as rh: + with pytest.raises(HTTPError, match='redirect loop'): + validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - def test_gzip_trailing_garbage(self): - # 
https://github.com/ytdl-org/youtube-dl/commit/aa3e950764337ef9800c936f4de89b31c00dfcf5 - # https://github.com/ytdl-org/youtube-dl/commit/6f2ec15cee79d35dba065677cad9da7491ec6e6f - with FakeYDL() as ydl: - data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8') - self.assertEqual(data, '<html><video src="/vid.mp4" /></html>') + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_incompleteread(self, handler): + with handler(timeout=2) as rh: + with pytest.raises(IncompleteRead): + validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @unittest.skipUnless(brotli, 'brotli support is not installed') - def test_brotli(self): - with FakeYDL() as ydl: - res = ydl.urlopen( - sanitized_Request( + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_cookies(self, handler): + cookiejar = http.cookiejar.CookieJar() + cookiejar.set_cookie(http.cookiejar.Cookie( + 0, 'test', 'ytdlp', None, False, '127.0.0.1', True, + False, '/headers', True, False, None, False, None, None, {})) + + with handler(cookiejar=cookiejar) as rh: + data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read() + assert b'Cookie: test=ytdlp' in data + + # Per request + with handler() as rh: + data = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() + assert b'Cookie: test=ytdlp' in data + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_headers(self, handler): + + with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: + # Global Headers + data = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers')).read() + assert b'Test1: test' in data + + # Per request headers, merged with global + data = validate_and_send(rh, Request( + f'http://127.0.0.1:{self.http_port}/headers', headers={'test2': 'changed', 'test3': 'test3'})).read() + assert b'Test1: test' in data + assert b'Test2: changed' in data + assert b'Test2: test2' not in data + assert b'Test3: test3' in data + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_timeout(self, handler): + with handler() as rh: + # Default timeout is 20 seconds, so this should go through + validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_3')) + + with handler(timeout=0.5) as rh: + with pytest.raises(TransportError): + validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1')) + + # Per request timeout, should override handler timeout + validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_source_address(self, handler): + source_address = f'127.0.0.{random.randint(5, 255)}' + with handler(source_address=source_address) as rh: + data = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() + assert source_address == data + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_gzip_trailing_garbage(self, handler): + with handler() as rh: + data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() + assert data == '<html><video src="/vid.mp4" /></html>' + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.skipif(not brotli, 
reason='brotli support is not installed') + def test_brotli(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', headers={'ytdl-encoding': 'br'})) - self.assertEqual(res.headers.get('Content-Encoding'), 'br') - self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') + assert res.headers.get('Content-Encoding') == 'br' + assert res.read() == b'<html><video src="/vid.mp4" /></html>' - def test_deflate(self): - with FakeYDL() as ydl: - res = ydl.urlopen( - sanitized_Request( + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_deflate(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', headers={'ytdl-encoding': 'deflate'})) - self.assertEqual(res.headers.get('Content-Encoding'), 'deflate') - self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') + assert res.headers.get('Content-Encoding') == 'deflate' + assert res.read() == b'<html><video src="/vid.mp4" /></html>' - def test_gzip(self): - with FakeYDL() as ydl: - res = ydl.urlopen( - sanitized_Request( + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_gzip(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', headers={'ytdl-encoding': 'gzip'})) - self.assertEqual(res.headers.get('Content-Encoding'), 'gzip') - self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') + assert res.headers.get('Content-Encoding') == 'gzip' + assert res.read() == b'<html><video src="/vid.mp4" /></html>' - def test_multiple_encodings(self): - # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4 - with FakeYDL() as ydl: + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_multiple_encodings(self, handler): + with handler() as rh: for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): - res = ydl.urlopen( - sanitized_Request( + res = validate_and_send( + rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', headers={'ytdl-encoding': pair})) - self.assertEqual(res.headers.get('Content-Encoding'), pair) - self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') + assert res.headers.get('Content-Encoding') == pair + assert res.read() == b'<html><video src="/vid.mp4" /></html>' - def test_unsupported_encoding(self): - # it should return the raw content - with FakeYDL() as ydl: - res = ydl.urlopen( - sanitized_Request( + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_unsupported_encoding(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request( f'http://127.0.0.1:{self.http_port}/content-encoding', headers={'ytdl-encoding': 'unsupported'})) - self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported') - self.assertEqual(res.read(), b'raw') + assert res.headers.get('Content-Encoding') == 'unsupported' + assert res.read() == b'raw' + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_read(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers')) + assert res.readable() + assert res.read(1) == b'H' + assert res.read(3) == b'ost' -class TestClientCert(unittest.TestCase): - def setUp(self): +class TestHTTPProxy(TestRequestHandlerBase): + @classmethod + def setup_class(cls): + super().setup_class() + # 
HTTP Proxy server + cls.proxy = http.server.ThreadingHTTPServer( + ('127.0.0.1', 0), _build_proxy_handler('normal')) + cls.proxy_port = http_server_port(cls.proxy) + cls.proxy_thread = threading.Thread(target=cls.proxy.serve_forever) + cls.proxy_thread.daemon = True + cls.proxy_thread.start() + + # Geo proxy server + cls.geo_proxy = http.server.ThreadingHTTPServer( + ('127.0.0.1', 0), _build_proxy_handler('geo')) + cls.geo_port = http_server_port(cls.geo_proxy) + cls.geo_proxy_thread = threading.Thread(target=cls.geo_proxy.serve_forever) + cls.geo_proxy_thread.daemon = True + cls.geo_proxy_thread.start() + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_http_proxy(self, handler): + http_proxy = f'http://127.0.0.1:{self.proxy_port}' + geo_proxy = f'http://127.0.0.1:{self.geo_port}' + + # Test global http proxy + # Test per request http proxy + # Test per request http proxy disables proxy + url = 'http://foo.com/bar' + + # Global HTTP proxy + with handler(proxies={'http': http_proxy}) as rh: + res = validate_and_send(rh, Request(url)).read().decode() + assert res == f'normal: {url}' + + # Per request proxy overrides global + res = validate_and_send(rh, Request(url, proxies={'http': geo_proxy})).read().decode() + assert res == f'geo: {url}' + + # and setting to None disables all proxies for that request + real_url = f'http://127.0.0.1:{self.http_port}/headers' + res = validate_and_send( + rh, Request(real_url, proxies={'http': None})).read().decode() + assert res != f'normal: {real_url}' + assert 'Accept' in res + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_noproxy(self, handler): + with handler(proxies={'proxy': f'http://127.0.0.1:{self.proxy_port}'}) as rh: + # NO_PROXY + for no_proxy in (f'127.0.0.1:{self.http_port}', '127.0.0.1', 'localhost'): + nop_response = validate_and_send( + rh, Request(f'http://127.0.0.1:{self.http_port}/headers', proxies={'no': no_proxy})).read().decode( + 'utf-8') + assert 'Accept' in nop_response + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_allproxy(self, handler): + url = 'http://foo.com/bar' + with handler() as rh: + response = validate_and_send(rh, Request(url, proxies={'all': f'http://127.0.0.1:{self.proxy_port}'})).read().decode( + 'utf-8') + assert response == f'normal: {url}' + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_http_proxy_with_idn(self, handler): + with handler(proxies={ + 'http': f'http://127.0.0.1:{self.proxy_port}', + }) as rh: + url = 'http://中文.tw/' + response = rh.send(Request(url)).read().decode() + # b'xn--fiq228c' is '中文'.encode('idna') + assert response == 'normal: http://xn--fiq228c.tw/' + + +class TestClientCertificate: + + @classmethod + def setup_class(cls): certfn = os.path.join(TEST_DIR, 'testcert.pem') - self.certdir = os.path.join(TEST_DIR, 'testdata', 'certificate') - cacertfn = os.path.join(self.certdir, 'ca.crt') - self.httpd = http.server.HTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler) + cls.certdir = os.path.join(TEST_DIR, 'testdata', 'certificate') + cacertfn = os.path.join(cls.certdir, 'ca.crt') + cls.httpd = http.server.ThreadingHTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler) sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) sslctx.verify_mode = ssl.CERT_REQUIRED sslctx.load_verify_locations(cafile=cacertfn) sslctx.load_cert_chain(certfn, None) - self.httpd.socket = sslctx.wrap_socket(self.httpd.socket, server_side=True) - self.port = http_server_port(self.httpd) - self.server_thread = 
threading.Thread(target=self.httpd.serve_forever) - self.server_thread.daemon = True - self.server_thread.start() + cls.httpd.socket = sslctx.wrap_socket(cls.httpd.socket, server_side=True) + cls.port = http_server_port(cls.httpd) + cls.server_thread = threading.Thread(target=cls.httpd.serve_forever) + cls.server_thread.daemon = True + cls.server_thread.start() - def _run_test(self, **params): - ydl = YoutubeDL({ - 'logger': FakeLogger(), + def _run_test(self, handler, **handler_kwargs): + with handler( # Disable client-side validation of unacceptable self-signed testcert.pem # The test is of a check on the server side, so unaffected - 'nocheckcertificate': True, - **params, + verify=False, + **handler_kwargs, + ) as rh: + validate_and_send(rh, Request(f'https://127.0.0.1:{self.port}/video.html')).read().decode() + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_certificate_combined_nopass(self, handler): + self._run_test(handler, client_cert={ + 'client_certificate': os.path.join(self.certdir, 'clientwithkey.crt'), }) - r = ydl.extract_info(f'https://127.0.0.1:{self.port}/video.html') - self.assertEqual(r['url'], f'https://127.0.0.1:{self.port}/vid.mp4') - def test_certificate_combined_nopass(self): - self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt')) - - def test_certificate_nocombined_nopass(self): - self._run_test(client_certificate=os.path.join(self.certdir, 'client.crt'), - client_certificate_key=os.path.join(self.certdir, 'client.key')) - - def test_certificate_combined_pass(self): - self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithencryptedkey.crt'), - client_certificate_password='foobar') - - def test_certificate_nocombined_pass(self): - self._run_test(client_certificate=os.path.join(self.certdir, 'client.crt'), - client_certificate_key=os.path.join(self.certdir, 'clientencrypted.key'), - client_certificate_password='foobar') - - -def _build_proxy_handler(name): - class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): - proxy_name = name - - def log_message(self, format, *args): - pass - - def do_GET(self): - self.send_response(200) - self.send_header('Content-Type', 'text/plain; charset=utf-8') - self.end_headers() - self.wfile.write(f'{self.proxy_name}: {self.path}'.encode()) - return HTTPTestRequestHandler - - -class TestProxy(unittest.TestCase): - def setUp(self): - self.proxy = http.server.HTTPServer( - ('127.0.0.1', 0), _build_proxy_handler('normal')) - self.port = http_server_port(self.proxy) - self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) - self.proxy_thread.daemon = True - self.proxy_thread.start() - - self.geo_proxy = http.server.HTTPServer( - ('127.0.0.1', 0), _build_proxy_handler('geo')) - self.geo_port = http_server_port(self.geo_proxy) - self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) - self.geo_proxy_thread.daemon = True - self.geo_proxy_thread.start() - - def test_proxy(self): - geo_proxy = f'127.0.0.1:{self.geo_port}' - ydl = YoutubeDL({ - 'proxy': f'127.0.0.1:{self.port}', - 'geo_verification_proxy': geo_proxy, + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_certificate_nocombined_nopass(self, handler): + self._run_test(handler, client_cert={ + 'client_certificate': os.path.join(self.certdir, 'client.crt'), + 'client_certificate_key': os.path.join(self.certdir, 'client.key'), }) - url = 'http://foo.com/bar' - response = ydl.urlopen(url).read().decode() - self.assertEqual(response, f'normal: 
{url}') - req = urllib.request.Request(url) - req.add_header('Ytdl-request-proxy', geo_proxy) - response = ydl.urlopen(req).read().decode() - self.assertEqual(response, f'geo: {url}') - - def test_proxy_with_idn(self): - ydl = YoutubeDL({ - 'proxy': f'127.0.0.1:{self.port}', + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_certificate_combined_pass(self, handler): + self._run_test(handler, client_cert={ + 'client_certificate': os.path.join(self.certdir, 'clientwithencryptedkey.crt'), + 'client_certificate_password': 'foobar', + }) + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_certificate_nocombined_pass(self, handler): + self._run_test(handler, client_cert={ + 'client_certificate': os.path.join(self.certdir, 'client.crt'), + 'client_certificate_key': os.path.join(self.certdir, 'clientencrypted.key'), + 'client_certificate_password': 'foobar', }) - url = 'http://中文.tw/' - response = ydl.urlopen(url).read().decode() - # b'xn--fiq228c' is '中文'.encode('idna') - self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') -class TestFileURL(unittest.TestCase): - # See https://github.com/ytdl-org/youtube-dl/issues/8227 - def test_file_urls(self): +class TestUrllibRequestHandler(TestRequestHandlerBase): + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_file_urls(self, handler): + # See https://github.com/ytdl-org/youtube-dl/issues/8227 tf = tempfile.NamedTemporaryFile(delete=False) tf.write(b'foobar') tf.close() - url = pathlib.Path(tf.name).as_uri() - with FakeYDL() as ydl: - self.assertRaisesRegex( - urllib.error.URLError, 'file:// URLs are explicitly disabled in yt-dlp for security reasons', ydl.urlopen, url) - with FakeYDL({'enable_file_urls': True}) as ydl: - res = ydl.urlopen(url) - self.assertEqual(res.read(), b'foobar') + req = Request(pathlib.Path(tf.name).as_uri()) + with handler() as rh: + with pytest.raises(UnsupportedRequest): + rh.validate(req) + + # Test that urllib never loaded FileHandler + with pytest.raises(TransportError): + rh.send(req) + + with handler(enable_file_urls=True) as rh: + res = validate_and_send(rh, req) + assert res.read() == b'foobar' res.close() + os.unlink(tf.name) + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_http_error_returns_content(self, handler): + # urllib HTTPError will try close the underlying response if reference to the HTTPError object is lost + def get_response(): + with handler() as rh: + # headers url + try: + validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/gen_404')) + except HTTPError as e: + return e.response -if __name__ == '__main__': - unittest.main() + assert get_response().read() == b'<html></html>' + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_verify_cert_error_text(self, handler): + # Check the output of the error message + with handler() as rh: + with pytest.raises( + CertificateVerifyError, + match=r'\[SSL: CERTIFICATE_VERIFY_FAILED\] certificate verify failed: self.signed certificate' + ): + validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers')) + + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_httplib_validation_errors(self, handler): + with handler() as rh: + + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256 + with pytest.raises(RequestError, match='method can\'t contain control characters') as exc_info: + validate_and_send(rh, Request('http://127.0.0.1', 
method='GET\n')) + assert not isinstance(exc_info.value, TransportError) + + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1265 + with pytest.raises(RequestError, match='URL can\'t contain control characters') as exc_info: + validate_and_send(rh, Request('http://127.0.0. 1', method='GET\n')) + assert not isinstance(exc_info.value, TransportError) + + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1288C31-L1288C50 + with pytest.raises(RequestError, match='Invalid header name') as exc_info: + validate_and_send(rh, Request('http://127.0.0.1', headers={'foo\n': 'bar'})) + assert not isinstance(exc_info.value, TransportError) + + +def run_validation(handler, fail, req, **handler_kwargs): + with handler(**handler_kwargs) as rh: + if fail: + with pytest.raises(UnsupportedRequest): + rh.validate(req) + else: + rh.validate(req) + + +class TestRequestHandlerValidation: + + class ValidationRH(RequestHandler): + def _send(self, request): + raise RequestError('test') + + class NoCheckRH(ValidationRH): + _SUPPORTED_FEATURES = None + _SUPPORTED_PROXY_SCHEMES = None + _SUPPORTED_URL_SCHEMES = None + + class HTTPSupportedRH(ValidationRH): + _SUPPORTED_URL_SCHEMES = ('http',) + + URL_SCHEME_TESTS = [ + # scheme, expected to fail, handler kwargs + ('Urllib', [ + ('http', False, {}), + ('https', False, {}), + ('data', False, {}), + ('ftp', False, {}), + ('file', True, {}), + ('file', False, {'enable_file_urls': True}), + ]), + (NoCheckRH, [('http', False, {})]), + (ValidationRH, [('http', True, {})]) + ] + + PROXY_SCHEME_TESTS = [ + # scheme, expected to fail + ('Urllib', [ + ('http', False), + ('https', True), + ('socks4', False), + ('socks4a', False), + ('socks5', False), + ('socks5h', False), + ('socks', True), + ]), + (NoCheckRH, [('http', False)]), + (HTTPSupportedRH, [('http', True)]), + ] + + PROXY_KEY_TESTS = [ + # key, expected to fail + ('Urllib', [ + ('all', False), + ('unrelated', False), + ]), + (NoCheckRH, [('all', False)]), + (HTTPSupportedRH, [('all', True)]), + (HTTPSupportedRH, [('no', True)]), + ] + + @pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [ + (handler_tests[0], scheme, fail, handler_kwargs) + for handler_tests in URL_SCHEME_TESTS + for scheme, fail, handler_kwargs in handler_tests[1] + + ], indirect=['handler']) + def test_url_scheme(self, handler, scheme, fail, handler_kwargs): + run_validation(handler, fail, Request(f'{scheme}://'), **(handler_kwargs or {})) + + @pytest.mark.parametrize('handler,fail', [('Urllib', False)], indirect=['handler']) + def test_no_proxy(self, handler, fail): + run_validation(handler, fail, Request('http://', proxies={'no': '127.0.0.1,github.com'})) + run_validation(handler, fail, Request('http://'), proxies={'no': '127.0.0.1,github.com'}) + + @pytest.mark.parametrize('handler,proxy_key,fail', [ + (handler_tests[0], proxy_key, fail) + for handler_tests in PROXY_KEY_TESTS + for proxy_key, fail in handler_tests[1] + ], indirect=['handler']) + def test_proxy_key(self, handler, proxy_key, fail): + run_validation(handler, fail, Request('http://', proxies={proxy_key: 'http://example.com'})) + run_validation(handler, fail, Request('http://'), proxies={proxy_key: 'http://example.com'}) + + @pytest.mark.parametrize('handler,scheme,fail', [ + (handler_tests[0], scheme, fail) + for handler_tests in PROXY_SCHEME_TESTS + for scheme, fail in handler_tests[1] + ], indirect=['handler']) + def test_proxy_scheme(self, handler, scheme, fail): + 
run_validation(handler, fail, Request('http://', proxies={'http': f'{scheme}://example.com'}))
+        run_validation(handler, fail, Request('http://'), proxies={'http': f'{scheme}://example.com'})
+
+    @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH], indirect=True)
+    def test_empty_proxy(self, handler):
+        run_validation(handler, False, Request('http://', proxies={'http': None}))
+        run_validation(handler, False, Request('http://'), proxies={'http': None})
+
+    @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1'])
+    @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+    def test_missing_proxy_scheme(self, handler, proxy_url):
+        run_validation(handler, True, Request('http://', proxies={'http': proxy_url}))
+
+    @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+    def test_cookiejar_extension(self, handler):
+        run_validation(handler, True, Request('http://', extensions={'cookiejar': 'notacookiejar'}))
+
+    @pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
+    def test_timeout_extension(self, handler):
+        run_validation(handler, True, Request('http://', extensions={'timeout': 'notavalidtimeout'}))
+
+    def test_invalid_request_type(self):
+        rh = self.ValidationRH(logger=FakeLogger())
+        for method in (rh.validate, rh.send):
+            with pytest.raises(TypeError, match='Expected an instance of Request'):
+                method('not a request')
+
+
+class FakeResponse(Response):
+    def __init__(self, request):
+        # XXX: we could make request part of standard response interface
+        self.request = request
+        super().__init__(fp=io.BytesIO(b''), headers={}, url=request.url)
+
+
+class FakeRH(RequestHandler):
+
+    def _validate(self, request):
+        return
+
+    def _send(self, request: Request):
+        if request.url.startswith('ssl://'):
+            raise SSLError(request.url[len('ssl://'):])
+        return FakeResponse(request)
+
+
+class FakeRHYDL(FakeYDL):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._request_director = self.build_request_director([FakeRH])
+
+
+class TestRequestDirector:
+
+    def test_handler_operations(self):
+        director = RequestDirector(logger=FakeLogger())
+        handler = FakeRH(logger=FakeLogger())
+        director.add_handler(handler)
+        assert director.handlers.get(FakeRH.RH_KEY) is handler
+
+        # Handler should overwrite
+        handler2 = FakeRH(logger=FakeLogger())
+        director.add_handler(handler2)
+        assert director.handlers.get(FakeRH.RH_KEY) is not handler
+        assert director.handlers.get(FakeRH.RH_KEY) is handler2
+        assert len(director.handlers) == 1
+
+        class AnotherFakeRH(FakeRH):
+            pass
+        director.add_handler(AnotherFakeRH(logger=FakeLogger()))
+        assert len(director.handlers) == 2
+        assert director.handlers.get(AnotherFakeRH.RH_KEY).RH_KEY == AnotherFakeRH.RH_KEY
+
+        director.handlers.pop(FakeRH.RH_KEY, None)
+        assert director.handlers.get(FakeRH.RH_KEY) is None
+        assert len(director.handlers) == 1
+
+        # RequestErrors should passthrough
+        with pytest.raises(SSLError):
+            director.send(Request('ssl://something'))
+
+    def test_send(self):
+        director = RequestDirector(logger=FakeLogger())
+        with pytest.raises(RequestError):
+            director.send(Request('any://'))
+        director.add_handler(FakeRH(logger=FakeLogger()))
+        assert isinstance(director.send(Request('http://')), FakeResponse)
+
+    def test_unsupported_handlers(self):
+        director = RequestDirector(logger=FakeLogger())
+        director.add_handler(FakeRH(logger=FakeLogger()))
+
+        class SupportedRH(RequestHandler):
+            _SUPPORTED_URL_SCHEMES = ['http']
+
+            def _send(self, request: Request):
+                
return Response(fp=io.BytesIO(b'supported'), headers={}, url=request.url) + + # This handler should by default take preference over FakeRH + director.add_handler(SupportedRH(logger=FakeLogger())) + assert director.send(Request('http://')).read() == b'supported' + assert director.send(Request('any://')).read() == b'' + + director.handlers.pop(FakeRH.RH_KEY) + with pytest.raises(NoSupportingHandlers): + director.send(Request('any://')) + + def test_unexpected_error(self): + director = RequestDirector(logger=FakeLogger()) + + class UnexpectedRH(FakeRH): + def _send(self, request: Request): + raise TypeError('something') + + director.add_handler(UnexpectedRH(logger=FakeLogger)) + with pytest.raises(NoSupportingHandlers, match=r'1 unexpected error'): + director.send(Request('any://')) + + director.handlers.clear() + assert len(director.handlers) == 0 + + # Should not be fatal + director.add_handler(FakeRH(logger=FakeLogger())) + director.add_handler(UnexpectedRH(logger=FakeLogger)) + assert director.send(Request('any://')) + + +# XXX: do we want to move this to test_YoutubeDL.py? +class TestYoutubeDLNetworking: + + @staticmethod + def build_handler(ydl, handler: RequestHandler = FakeRH): + return ydl.build_request_director([handler]).handlers.get(handler.RH_KEY) + + def test_compat_opener(self): + with FakeYDL() as ydl: + with warnings.catch_warnings(): + warnings.simplefilter('ignore', category=DeprecationWarning) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) + + @pytest.mark.parametrize('proxy,expected', [ + ('http://127.0.0.1:8080', {'all': 'http://127.0.0.1:8080'}), + ('', {'all': '__noproxy__'}), + (None, {'http': 'http://127.0.0.1:8081', 'https': 'http://127.0.0.1:8081'}) # env, set https + ]) + def test_proxy(self, proxy, expected): + old_http_proxy = os.environ.get('HTTP_PROXY') + try: + os.environ['HTTP_PROXY'] = 'http://127.0.0.1:8081' # ensure that provided proxies override env + with FakeYDL({'proxy': proxy}) as ydl: + assert ydl.proxies == expected + finally: + if old_http_proxy: + os.environ['HTTP_PROXY'] = old_http_proxy + + def test_compat_request(self): + with FakeRHYDL() as ydl: + assert ydl.urlopen('test://') + urllib_req = urllib.request.Request('http://foo.bar', data=b'test', method='PUT', headers={'X-Test': '1'}) + urllib_req.add_unredirected_header('Cookie', 'bob=bob') + urllib_req.timeout = 2 + + req = ydl.urlopen(urllib_req).request + assert req.url == urllib_req.get_full_url() + assert req.data == urllib_req.data + assert req.method == urllib_req.get_method() + assert 'X-Test' in req.headers + assert 'Cookie' in req.headers + assert req.extensions.get('timeout') == 2 + + with pytest.raises(AssertionError): + ydl.urlopen(None) + + def test_extract_basic_auth(self): + with FakeRHYDL() as ydl: + res = ydl.urlopen(Request('http://user:pass@foo.bar')) + assert res.request.headers['Authorization'] == 'Basic dXNlcjpwYXNz' + + def test_sanitize_url(self): + with FakeRHYDL() as ydl: + res = ydl.urlopen(Request('httpss://foo.bar')) + assert res.request.url == 'https://foo.bar' + + def test_file_urls_error(self): + # use urllib handler + with FakeYDL() as ydl: + with pytest.raises(RequestError, match=r'file:// URLs are disabled by default'): + ydl.urlopen('file://') + + def test_legacy_server_connect_error(self): + with FakeRHYDL() as ydl: + for error in ('UNSAFE_LEGACY_RENEGOTIATION_DISABLED', 'SSLV3_ALERT_HANDSHAKE_FAILURE'): + with pytest.raises(RequestError, match=r'Try using --legacy-server-connect'): + ydl.urlopen(f'ssl://{error}') + + with 
pytest.raises(SSLError, match='testerror'): + ydl.urlopen('ssl://testerror') + + @pytest.mark.parametrize('proxy_key,proxy_url,expected', [ + ('http', '__noproxy__', None), + ('no', '127.0.0.1,foo.bar', '127.0.0.1,foo.bar'), + ('https', 'example.com', 'http://example.com'), + ('https', 'socks5://example.com', 'socks5h://example.com'), + ('http', 'socks://example.com', 'socks4://example.com'), + ('http', 'socks4://example.com', 'socks4://example.com'), + ]) + def test_clean_proxy(self, proxy_key, proxy_url, expected): + # proxies should be cleaned in urlopen() + with FakeRHYDL() as ydl: + req = ydl.urlopen(Request('test://', proxies={proxy_key: proxy_url})).request + assert req.proxies[proxy_key] == expected + + # and should also be cleaned when building the handler + env_key = f'{proxy_key.upper()}_PROXY' + old_env_proxy = os.environ.get(env_key) + try: + os.environ[env_key] = proxy_url # ensure that provided proxies override env + with FakeYDL() as ydl: + rh = self.build_handler(ydl) + assert rh.proxies[proxy_key] == expected + finally: + if old_env_proxy: + os.environ[env_key] = old_env_proxy + + def test_clean_proxy_header(self): + with FakeRHYDL() as ydl: + req = ydl.urlopen(Request('test://', headers={'ytdl-request-proxy': '//foo.bar'})).request + assert 'ytdl-request-proxy' not in req.headers + assert req.proxies == {'all': 'http://foo.bar'} + + with FakeYDL({'http_headers': {'ytdl-request-proxy': '//foo.bar'}}) as ydl: + rh = self.build_handler(ydl) + assert 'ytdl-request-proxy' not in rh.headers + assert rh.proxies == {'all': 'http://foo.bar'} + + def test_clean_header(self): + with FakeRHYDL() as ydl: + res = ydl.urlopen(Request('test://', headers={'Youtubedl-no-compression': True})) + assert 'Youtubedl-no-compression' not in res.request.headers + assert res.request.headers.get('Accept-Encoding') == 'identity' + + with FakeYDL({'http_headers': {'Youtubedl-no-compression': True}}) as ydl: + rh = self.build_handler(ydl) + assert 'Youtubedl-no-compression' not in rh.headers + assert rh.headers.get('Accept-Encoding') == 'identity' + + def test_build_handler_params(self): + with FakeYDL({ + 'http_headers': {'test': 'testtest'}, + 'socket_timeout': 2, + 'proxy': 'http://127.0.0.1:8080', + 'source_address': '127.0.0.45', + 'debug_printtraffic': True, + 'compat_opts': ['no-certifi'], + 'nocheckcertificate': True, + 'legacy_server_connect': True, + }) as ydl: + rh = self.build_handler(ydl) + assert rh.headers.get('test') == 'testtest' + assert 'Accept' in rh.headers # ensure std_headers are still there + assert rh.timeout == 2 + assert rh.proxies.get('all') == 'http://127.0.0.1:8080' + assert rh.source_address == '127.0.0.45' + assert rh.verbose is True + assert rh.prefer_system_certs is True + assert rh.verify is False + assert rh.legacy_ssl_support is True + + @pytest.mark.parametrize('ydl_params', [ + {'client_certificate': 'fakecert.crt'}, + {'client_certificate': 'fakecert.crt', 'client_certificate_key': 'fakekey.key'}, + {'client_certificate': 'fakecert.crt', 'client_certificate_key': 'fakekey.key', 'client_certificate_password': 'foobar'}, + {'client_certificate_key': 'fakekey.key', 'client_certificate_password': 'foobar'}, + ]) + def test_client_certificate(self, ydl_params): + with FakeYDL(ydl_params) as ydl: + rh = self.build_handler(ydl) + assert rh._client_cert == ydl_params # XXX: Too bound to implementation + + def test_urllib_file_urls(self): + with FakeYDL({'enable_file_urls': False}) as ydl: + rh = self.build_handler(ydl, UrllibRH) + assert rh.enable_file_urls is False + + 
with FakeYDL({'enable_file_urls': True}) as ydl: + rh = self.build_handler(ydl, UrllibRH) + assert rh.enable_file_urls is True + + +class TestRequest: + + def test_query(self): + req = Request('http://example.com?q=something', query={'v': 'xyz'}) + assert req.url == 'http://example.com?q=something&v=xyz' + + req.update(query={'v': '123'}) + assert req.url == 'http://example.com?q=something&v=123' + req.update(url='http://example.com', query={'v': 'xyz'}) + assert req.url == 'http://example.com?v=xyz' + + def test_method(self): + req = Request('http://example.com') + assert req.method == 'GET' + req.data = b'test' + assert req.method == 'POST' + req.data = None + assert req.method == 'GET' + req.data = b'test2' + req.method = 'PUT' + assert req.method == 'PUT' + req.data = None + assert req.method == 'PUT' + with pytest.raises(TypeError): + req.method = 1 + + def test_request_helpers(self): + assert HEADRequest('http://example.com').method == 'HEAD' + assert PUTRequest('http://example.com').method == 'PUT' + + def test_headers(self): + req = Request('http://example.com', headers={'tesT': 'test'}) + assert req.headers == HTTPHeaderDict({'test': 'test'}) + req.update(headers={'teSt2': 'test2'}) + assert req.headers == HTTPHeaderDict({'test': 'test', 'test2': 'test2'}) + + req.headers = new_headers = HTTPHeaderDict({'test': 'test'}) + assert req.headers == HTTPHeaderDict({'test': 'test'}) + assert req.headers is new_headers + + # test converts dict to case insensitive dict + req.headers = new_headers = {'test2': 'test2'} + assert isinstance(req.headers, HTTPHeaderDict) + assert req.headers is not new_headers + + with pytest.raises(TypeError): + req.headers = None + + def test_data_type(self): + req = Request('http://example.com') + assert req.data is None + # test bytes is allowed + req.data = b'test' + assert req.data == b'test' + # test iterable of bytes is allowed + i = [b'test', b'test2'] + req.data = i + assert req.data == i + + # test file-like object is allowed + f = io.BytesIO(b'test') + req.data = f + assert req.data == f + + # common mistake: test str not allowed + with pytest.raises(TypeError): + req.data = 'test' + assert req.data != 'test' + + # common mistake: test dict is not allowed + with pytest.raises(TypeError): + req.data = {'test': 'test'} + assert req.data != {'test': 'test'} + + def test_content_length_header(self): + req = Request('http://example.com', headers={'Content-Length': '0'}, data=b'') + assert req.headers.get('Content-Length') == '0' + + req.data = b'test' + assert 'Content-Length' not in req.headers + + req = Request('http://example.com', headers={'Content-Length': '10'}) + assert 'Content-Length' not in req.headers + + def test_content_type_header(self): + req = Request('http://example.com', headers={'Content-Type': 'test'}, data=b'test') + assert req.headers.get('Content-Type') == 'test' + req.data = b'test2' + assert req.headers.get('Content-Type') == 'test' + req.data = None + assert 'Content-Type' not in req.headers + req.data = b'test3' + assert req.headers.get('Content-Type') == 'application/x-www-form-urlencoded' + + def test_proxies(self): + req = Request(url='http://example.com', proxies={'http': 'http://127.0.0.1:8080'}) + assert req.proxies == {'http': 'http://127.0.0.1:8080'} + + def test_extensions(self): + req = Request(url='http://example.com', extensions={'timeout': 2}) + assert req.extensions == {'timeout': 2} + + def test_copy(self): + req = Request( + url='http://example.com', + extensions={'cookiejar': CookieJar()}, + 
headers={'Accept-Encoding': 'br'}, + proxies={'http': 'http://127.0.0.1'}, + data=[b'123'] + ) + req_copy = req.copy() + assert req_copy is not req + assert req_copy.url == req.url + assert req_copy.headers == req.headers + assert req_copy.headers is not req.headers + assert req_copy.proxies == req.proxies + assert req_copy.proxies is not req.proxies + + # Data is not able to be copied + assert req_copy.data == req.data + assert req_copy.data is req.data + + # Shallow copy extensions + assert req_copy.extensions is not req.extensions + assert req_copy.extensions['cookiejar'] == req.extensions['cookiejar'] + + # Subclasses are copied by default + class AnotherRequest(Request): + pass + + req = AnotherRequest(url='http://127.0.0.1') + assert isinstance(req.copy(), AnotherRequest) + + def test_url(self): + req = Request(url='https://фtest.example.com/ some spaceв?ä=c',) + assert req.url == 'https://xn--test-z6d.example.com/%20some%20space%D0%B2?%C3%A4=c' + + assert Request(url='//example.com').url == 'http://example.com' + + with pytest.raises(TypeError): + Request(url='https://').url = None + + +class TestResponse: + + @pytest.mark.parametrize('reason,status,expected', [ + ('custom', 200, 'custom'), + (None, 404, 'Not Found'), # fallback status + ('', 403, 'Forbidden'), + (None, 999, None) + ]) + def test_reason(self, reason, status, expected): + res = Response(io.BytesIO(b''), url='test://', headers={}, status=status, reason=reason) + assert res.reason == expected + + def test_headers(self): + headers = Message() + headers.add_header('Test', 'test') + headers.add_header('Test', 'test2') + headers.add_header('content-encoding', 'br') + res = Response(io.BytesIO(b''), headers=headers, url='test://') + assert res.headers.get_all('test') == ['test', 'test2'] + assert 'Content-Encoding' in res.headers + + def test_get_header(self): + headers = Message() + headers.add_header('Set-Cookie', 'cookie1') + headers.add_header('Set-cookie', 'cookie2') + headers.add_header('Test', 'test') + headers.add_header('Test', 'test2') + res = Response(io.BytesIO(b''), headers=headers, url='test://') + assert res.get_header('test') == 'test, test2' + assert res.get_header('set-Cookie') == 'cookie1' + assert res.get_header('notexist', 'default') == 'default' + + def test_compat(self): + res = Response(io.BytesIO(b''), url='test://', status=404, headers={'test': 'test'}) + assert res.code == res.getcode() == res.status + assert res.geturl() == res.url + assert res.info() is res.headers + assert res.getheader('test') == res.get_header('test') diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py new file mode 100644 index 0000000000..f9f876af3d --- /dev/null +++ b/test/test_networking_utils.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import io +import platform +import random +import ssl +import urllib.error + +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.dependencies import certifi +from yt_dlp.networking import Response +from yt_dlp.networking._helper import ( + InstanceStoreMixin, + add_accept_encoding_header, + get_redirect_method, + make_socks_proxy_opts, + select_proxy, + ssl_load_certs, +) +from yt_dlp.networking.exceptions import ( + HTTPError, + IncompleteRead, + _CompatHTTPError, +) +from yt_dlp.socks import ProxyType +from yt_dlp.utils.networking import HTTPHeaderDict + +TEST_DIR = 
os.path.dirname(os.path.abspath(__file__)) + + +class TestNetworkingUtils: + + def test_select_proxy(self): + proxies = { + 'all': 'socks5://example.com', + 'http': 'http://example.com:1080', + 'no': 'bypass.example.com,yt-dl.org' + } + + assert select_proxy('https://example.com', proxies) == proxies['all'] + assert select_proxy('http://example.com', proxies) == proxies['http'] + assert select_proxy('http://bypass.example.com', proxies) is None + assert select_proxy('https://yt-dl.org', proxies) is None + + @pytest.mark.parametrize('socks_proxy,expected', [ + ('socks5h://example.com', { + 'proxytype': ProxyType.SOCKS5, + 'addr': 'example.com', + 'port': 1080, + 'rdns': True, + 'username': None, + 'password': None + }), + ('socks5://user:@example.com:5555', { + 'proxytype': ProxyType.SOCKS5, + 'addr': 'example.com', + 'port': 5555, + 'rdns': False, + 'username': 'user', + 'password': '' + }), + ('socks4://u%40ser:pa%20ss@127.0.0.1:1080', { + 'proxytype': ProxyType.SOCKS4, + 'addr': '127.0.0.1', + 'port': 1080, + 'rdns': False, + 'username': 'u@ser', + 'password': 'pa ss' + }), + ('socks4a://:pa%20ss@127.0.0.1', { + 'proxytype': ProxyType.SOCKS4A, + 'addr': '127.0.0.1', + 'port': 1080, + 'rdns': True, + 'username': '', + 'password': 'pa ss' + }) + ]) + def test_make_socks_proxy_opts(self, socks_proxy, expected): + assert make_socks_proxy_opts(socks_proxy) == expected + + def test_make_socks_proxy_unknown(self): + with pytest.raises(ValueError, match='Unknown SOCKS proxy version: socks'): + make_socks_proxy_opts('socks://127.0.0.1') + + @pytest.mark.skipif(not certifi, reason='certifi is not installed') + def test_load_certifi(self): + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context2 = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_load_certs(context, use_certifi=True) + context2.load_verify_locations(cafile=certifi.where()) + assert context.get_ca_certs() == context2.get_ca_certs() + + # Test load normal certs + # XXX: could there be a case where system certs are the same as certifi? 
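+        # Note: this assumes the default system store differs from certifi's
+        # bundle on the test machine; if they ever matched, the inequality
+        # check below would fail spuriously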
+        context3 = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+        ssl_load_certs(context3, use_certifi=False)
+        assert context3.get_ca_certs() != context.get_ca_certs()
+
+    @pytest.mark.parametrize('method,status,expected', [
+        ('GET', 303, 'GET'),
+        ('HEAD', 303, 'HEAD'),
+        ('PUT', 303, 'GET'),
+        ('POST', 301, 'GET'),
+        ('HEAD', 301, 'HEAD'),
+        ('POST', 302, 'GET'),
+        ('HEAD', 302, 'HEAD'),
+        ('PUT', 302, 'PUT'),
+        ('POST', 308, 'POST'),
+        ('POST', 307, 'POST'),
+        ('HEAD', 308, 'HEAD'),
+        ('HEAD', 307, 'HEAD'),
+    ])
+    def test_get_redirect_method(self, method, status, expected):
+        assert get_redirect_method(method, status) == expected
+
+    @pytest.mark.parametrize('headers,supported_encodings,expected', [
+        ({'Accept-Encoding': 'br'}, ['gzip', 'br'], {'Accept-Encoding': 'br'}),
+        ({}, ['gzip', 'br'], {'Accept-Encoding': 'gzip, br'}),
+        ({'Content-type': 'application/json'}, [], {'Content-type': 'application/json', 'Accept-Encoding': 'identity'}),
+    ])
+    def test_add_accept_encoding_header(self, headers, supported_encodings, expected):
+        headers = HTTPHeaderDict(headers)
+        add_accept_encoding_header(headers, supported_encodings)
+        assert headers == HTTPHeaderDict(expected)
+
+
+class TestInstanceStoreMixin:
+
+    class FakeInstanceStoreMixin(InstanceStoreMixin):
+        def _create_instance(self, **kwargs):
+            return random.randint(0, 1000000)
+
+        def _close_instance(self, instance):
+            pass
+
+    def test_mixin(self):
+        mixin = self.FakeInstanceStoreMixin()
+        assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}}) == mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}})
+
+        assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'e', 4}}) != mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}})
+
+        assert mixin._get_instance(d={'a': 1, 'b': 2, 'c': {'d', 4}}) != mixin._get_instance(d={'a': 1, 'b': 2, 'g': {'d', 4}})
+
+        assert mixin._get_instance(d={'a': 1}, e=[1, 2, 3]) == mixin._get_instance(d={'a': 1}, e=[1, 2, 3])
+
+        assert mixin._get_instance(d={'a': 1}, e=[1, 2, 3]) != mixin._get_instance(d={'a': 1}, e=[1, 2, 3, 4])
+
+        cookiejar = YoutubeDLCookieJar()
+        assert mixin._get_instance(b=[1, 2], c=cookiejar) == mixin._get_instance(b=[1, 2], c=cookiejar)
+
+        assert mixin._get_instance(b=[1, 2], c=cookiejar) != mixin._get_instance(b=[1, 2], c=YoutubeDLCookieJar())
+
+        # Different order
+        assert mixin._get_instance(c=cookiejar, b=[1, 2]) == mixin._get_instance(b=[1, 2], c=cookiejar)
+
+        m = mixin._get_instance(t=1234)
+        assert mixin._get_instance(t=1234) == m
+        mixin._clear_instances()
+        assert mixin._get_instance(t=1234) != m
+
+
+class TestNetworkingExceptions:
+
+    @staticmethod
+    def create_response(status):
+        return Response(fp=io.BytesIO(b'test'), url='http://example.com', headers={'tesT': 'test'}, status=status)
+
+    @pytest.mark.parametrize('http_error_class', [HTTPError, lambda r: _CompatHTTPError(HTTPError(r))])
+    def test_http_error(self, http_error_class):
+
+        response = self.create_response(403)
+        error = http_error_class(response)
+
+        assert error.status == 403
+        assert str(error) == error.msg == 'HTTP Error 403: Forbidden'
+        assert error.reason == response.reason
+        assert error.response is response
+
+        data = error.response.read()
+        assert data == b'test'
+        assert repr(error) == '<HTTPError 403: Forbidden>'
+
+    @pytest.mark.parametrize('http_error_class', [HTTPError, lambda *args, **kwargs: _CompatHTTPError(HTTPError(*args, **kwargs))])
+    def test_redirect_http_error(self, http_error_class):
+        response = self.create_response(301)
+        error = http_error_class(response, redirect_loop=True)
+        assert 
str(error) == error.msg == 'HTTP Error 301: Moved Permanently (redirect loop detected)' + assert error.reason == 'Moved Permanently' + + def test_compat_http_error(self): + response = self.create_response(403) + error = _CompatHTTPError(HTTPError(response)) + assert isinstance(error, HTTPError) + assert isinstance(error, urllib.error.HTTPError) + + assert error.code == 403 + assert error.getcode() == 403 + assert error.hdrs is error.response.headers + assert error.info() is error.response.headers + assert error.headers is error.response.headers + assert error.filename == error.response.url + assert error.url == error.response.url + assert error.geturl() == error.response.url + + # Passthrough file operations + assert error.read() == b'test' + assert not error.closed + # Technically Response operations are also passed through, which should not be used. + assert error.get_header('test') == 'test' + + @pytest.mark.skipif( + platform.python_implementation() == 'PyPy', reason='garbage collector works differently in pypy') + def test_compat_http_error_autoclose(self): + # Compat HTTPError should not autoclose response + response = self.create_response(403) + _CompatHTTPError(HTTPError(response)) + assert not response.closed + + def test_incomplete_read_error(self): + error = IncompleteRead(b'test', 3, cause='test') + assert isinstance(error, IncompleteRead) + assert repr(error) == '<IncompleteRead: 4 bytes read, 3 more expected>' + assert str(error) == error.msg == '4 bytes read, 3 more expected' + assert error.partial == b'test' + assert error.expected == 3 + assert error.cause == 'test' + + error = IncompleteRead(b'aaa') + assert repr(error) == '<IncompleteRead: 3 bytes read>' + assert str(error) == '3 bytes read' diff --git a/test/test_utils.py b/test/test_utils.py index 862c7d0f75..768edfd0cf 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -51,6 +51,7 @@ escape_url, expand_path, extract_attributes, + extract_basic_auth, find_xpath_attr, fix_xml_ampersands, float_or_none, @@ -103,7 +104,6 @@ sanitize_filename, sanitize_path, sanitize_url, - sanitized_Request, shell_quote, smuggle_url, str_or_none, @@ -132,6 +132,7 @@ xpath_text, xpath_with_ns, ) +from yt_dlp.utils.networking import HTTPHeaderDict class TestUtil(unittest.TestCase): @@ -2315,14 +2316,43 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'], msg='function on a `re.Match` should give group name as well') + def test_http_header_dict(self): + headers = HTTPHeaderDict() + headers['ytdl-test'] = 1 + self.assertEqual(list(headers.items()), [('Ytdl-Test', '1')]) + headers['Ytdl-test'] = '2' + self.assertEqual(list(headers.items()), [('Ytdl-Test', '2')]) + self.assertTrue('ytDl-Test' in headers) + self.assertEqual(str(headers), str(dict(headers))) + self.assertEqual(repr(headers), str(dict(headers))) + + headers.update({'X-dlp': 'data'}) + self.assertEqual(set(headers.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data')}) + self.assertEqual(dict(headers), {'Ytdl-Test': '2', 'X-Dlp': 'data'}) + self.assertEqual(len(headers), 2) + self.assertEqual(headers.copy(), headers) + headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, **headers, **{'X-dlp': 'data2'}) + self.assertEqual(set(headers2.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data2')}) + self.assertEqual(len(headers2), 2) + headers2.clear() + self.assertEqual(len(headers2), 0) + + # ensure we prefer latter headers + headers3 = HTTPHeaderDict({'Ytdl-TeSt': 1}, {'Ytdl-test': 2}) + self.assertEqual(set(headers3.items()), 
{('Ytdl-Test', '2')}) + del headers3['ytdl-tesT'] + self.assertEqual(dict(headers3), {}) + + headers4 = HTTPHeaderDict({'ytdl-test': 'data;'}) + self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')}) + def test_extract_basic_auth(self): - auth_header = lambda url: sanitized_Request(url).get_header('Authorization') - self.assertFalse(auth_header('http://foo.bar')) - self.assertFalse(auth_header('http://:foo.bar')) - self.assertEqual(auth_header('http://@foo.bar'), 'Basic Og==') - self.assertEqual(auth_header('http://:pass@foo.bar'), 'Basic OnBhc3M=') - self.assertEqual(auth_header('http://user:@foo.bar'), 'Basic dXNlcjo=') - self.assertEqual(auth_header('http://user:pass@foo.bar'), 'Basic dXNlcjpwYXNz') + assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None) + assert extract_basic_auth('http://foo.bar') == ('http://foo.bar', None) + assert extract_basic_auth('http://@foo.bar') == ('http://foo.bar', 'Basic Og==') + assert extract_basic_auth('http://:pass@foo.bar') == ('http://foo.bar', 'Basic OnBhc3M=') + assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=') + assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') if __name__ == '__main__': diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 138646ebfc..29a18aef02 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4,7 +4,6 @@ import datetime import errno import fileinput -import functools import http.cookiejar import io import itertools @@ -25,8 +24,8 @@ import unicodedata from .cache import Cache -from .compat import urllib # isort: split -from .compat import compat_os_name, compat_shlex_quote +from .compat import functools, urllib # isort: split +from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req from .cookies import LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version @@ -34,6 +33,15 @@ from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import format_text +from .networking import Request, RequestDirector +from .networking.common import _REQUEST_HANDLERS +from .networking.exceptions import ( + HTTPError, + NoSupportingHandlers, + RequestError, + SSLError, + _CompatHTTPError, +) from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_pps from .postprocessor import ( @@ -78,7 +86,6 @@ MaxDownloadsReached, Namespace, PagedList, - PerRequestProxyHandler, PlaylistEntries, Popen, PostProcessingError, @@ -87,9 +94,6 @@ SameFileError, UnavailableVideoError, UserNotLive, - YoutubeDLCookieProcessor, - YoutubeDLHandler, - YoutubeDLRedirectHandler, age_restricted, args_to_str, bug_reports_message, @@ -102,6 +106,7 @@ error_to_compat_str, escapeHTML, expand_path, + extract_basic_auth, filter_dict, float_or_none, format_bytes, @@ -117,8 +122,6 @@ locked_file, make_archive_id, make_dir, - make_HTTPS_handler, - merge_headers, network_exceptions, number_of_digits, orderedSet, @@ -132,7 +135,6 @@ sanitize_filename, sanitize_path, sanitize_url, - sanitized_Request, std_headers, str_or_none, strftime_or_none, @@ -151,7 +153,12 @@ write_json_file, write_string, ) -from .utils.networking import clean_headers +from .utils._utils import _YDLLogger +from .utils.networking import ( + HTTPHeaderDict, + clean_headers, + clean_proxies, +) from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, 
__version__ if compat_os_name == 'nt': @@ -673,7 +680,9 @@ def process_color_policy(stream): raise self.params['compat_opts'] = set(self.params.get('compat_opts', ())) - self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) + self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) + self._request_director = self.build_request_director( + sorted(_REQUEST_HANDLERS.values(), key=lambda rh: rh.RH_NAME.lower())) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -763,8 +772,6 @@ def check_deprecated(param, option, suggestion): get_postprocessor(pp_def.pop('key'))(self, **pp_def), when=when) - self._setup_opener() - def preload_download_archive(fn): """Preload the archive, if any is specified""" archive = set() @@ -946,7 +953,11 @@ def save_cookies(self): def __exit__(self, *args): self.restore_console_title() + self.close() + + def close(self): self.save_cookies() + self._request_director.close() def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. @@ -2468,7 +2479,7 @@ def restore_last_token(self): return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): - res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) + res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers')) clean_headers(res) cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: @@ -3943,13 +3954,8 @@ def get_encoding(stream): join_nonempty(*get_package_info(m)) for m in available_dependencies.values() })) or 'none')) - self._setup_opener() - proxy_map = {} - for handler in self._opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_debug(f'Proxy map: {proxy_map}') - + write_debug(f'Proxy map: {self.proxies}') + # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers)}') for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -3977,53 +3983,21 @@ def get_encoding(stream): 'See https://yt-dl.org/update if you need help updating.' % latest_version) - def _setup_opener(self): - if hasattr(self, '_opener'): - return - timeout_val = self.params.get('socket_timeout') - self._socket_timeout = 20 if timeout_val is None else float(timeout_val) + @functools.cached_property + def proxies(self): + """Global proxy configuration""" opts_proxy = self.params.get('proxy') - - cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': - proxies = {} - else: - proxies = {'http': opts_proxy, 'https': opts_proxy} + opts_proxy = '__noproxy__' + proxies = {'all': opts_proxy} else: proxies = urllib.request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) + # compat. 
Set HTTPS_PROXY to __noproxy__ to revert if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] - proxy_handler = PerRequestProxyHandler(proxies) - debuglevel = 1 if self.params.get('debug_printtraffic') else 0 - https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) - ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - redirect_handler = YoutubeDLRedirectHandler() - data_handler = urllib.request.DataHandler() - - # When passing our own FileHandler instance, build_opener won't add the - # default FileHandler and allows us to disable the file protocol, which - # can be used for malicious purposes (see - # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = urllib.request.FileHandler() - - if not self.params.get('enable_file_urls'): - def file_open(*args, **kwargs): - raise urllib.error.URLError( - 'file:// URLs are explicitly disabled in yt-dlp for security reasons. ' - 'Use --enable-file-urls to enable at your own risk.') - file_handler.file_open = file_open - - opener = urllib.request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) - - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) - opener.addheaders = [] - self._opener = opener + return proxies @functools.cached_property def cookiejar(self): @@ -4031,11 +4005,84 @@ def cookiejar(self): return load_cookies( self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + @property + def _opener(self): + """ + Get a urllib OpenerDirector from the Urllib handler (deprecated). + """ + self.deprecation_warning('YoutubeDL._opener() is deprecated, use YoutubeDL.urlopen()') + handler = self._request_director.handlers['Urllib'] + return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies) + def urlopen(self, req): """ Start an HTTP download """ if isinstance(req, str): - req = sanitized_Request(req) - return self._opener.open(req, timeout=self._socket_timeout) + req = Request(req) + elif isinstance(req, urllib.request.Request): + req = urllib_req_to_req(req) + assert isinstance(req, Request) + + # compat: Assume user:pass url params are basic auth + url, basic_auth_header = extract_basic_auth(req.url) + if basic_auth_header: + req.headers['Authorization'] = basic_auth_header + req.url = sanitize_url(url) + + clean_proxies(proxies=req.proxies, headers=req.headers) + clean_headers(req.headers) + + try: + return self._request_director.send(req) + except NoSupportingHandlers as e: + for ue in e.unsupported_errors: + if not (ue.handler and ue.msg): + continue + if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower(): + raise RequestError( + 'file:// URLs are disabled by default in yt-dlp for security reasons. ' + 'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue + raise + except SSLError as e: + if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e): + raise RequestError('UNSAFE_LEGACY_RENEGOTIATION_DISABLED: Try using --legacy-server-connect', cause=e) from e + elif 'SSLV3_ALERT_HANDSHAKE_FAILURE' in str(e): + raise RequestError( + 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. 
' + 'Try using --legacy-server-connect', cause=e) from e + raise + except HTTPError as e: # TODO: Remove in a future release + raise _CompatHTTPError(e) from e + + def build_request_director(self, handlers): + logger = _YDLLogger(self) + headers = self.params.get('http_headers').copy() + proxies = self.proxies.copy() + clean_headers(headers) + clean_proxies(proxies, headers) + + director = RequestDirector(logger=logger, verbose=self.params.get('debug_printtraffic')) + for handler in handlers: + director.add_handler(handler( + logger=logger, + headers=headers, + cookiejar=self.cookiejar, + proxies=proxies, + prefer_system_certs='no-certifi' in self.params['compat_opts'], + verify=not self.params.get('nocheckcertificate'), + **traverse_obj(self.params, { + 'verbose': 'debug_printtraffic', + 'source_address': 'source_address', + 'timeout': 'socket_timeout', + 'legacy_ssl_support': 'legacy_server_connect', + 'enable_file_urls': 'enable_file_urls', + 'client_cert': { + 'client_certificate': 'client_certificate', + 'client_certificate_key': 'client_certificate_key', + 'client_certificate_password': 'client_certificate_password', + }, + }), + )) + return director def encode(self, s): if isinstance(s, bytes): @@ -4188,7 +4235,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None else: self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: - uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {}))) + uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index c6c02541c2..a41a80ebb6 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -70,3 +70,13 @@ def compat_expanduser(path): return userhome + path[i:] else: compat_expanduser = os.path.expanduser + + +def urllib_req_to_req(urllib_request): + """Convert urllib Request to a networking Request""" + from ..networking import Request + from ..utils.networking import HTTPHeaderDict + return Request( + urllib_request.get_full_url(), data=urllib_request.data, method=urllib_request.get_method(), + headers=HTTPHeaderDict(urllib_request.headers, urllib_request.unredirected_hdrs), + extensions={'timeout': urllib_request.timeout} if hasattr(urllib_request, 'timeout') else None) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 7c5daea859..45d094721a 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -1,12 +1,10 @@ -import http.client import os import random -import socket -import ssl import time import urllib.error from .common import FileDownloader +from ..networking.exceptions import CertificateVerifyError, TransportError from ..utils import ( ContentTooShortError, RetryManager, @@ -21,14 +19,6 @@ write_xattr, ) -RESPONSE_READ_EXCEPTIONS = ( - TimeoutError, - socket.timeout, # compat: py < 3.10 - ConnectionError, - ssl.SSLError, - http.client.HTTPException -) - class HttpFD(FileDownloader): def real_download(self, filename, info_dict): @@ -196,13 +186,9 @@ def establish_connection(): # Unexpected HTTP error raise raise RetryDownload(err) - except urllib.error.URLError as err: - if isinstance(err.reason, ssl.CertificateError): - raise - raise RetryDownload(err) - # In urllib.request.AbstractHTTPHandler, the response is partially read on request. 
- # Any errors that occur during this will not be wrapped by URLError - except RESPONSE_READ_EXCEPTIONS as err: + except CertificateVerifyError: + raise + except TransportError as err: raise RetryDownload(err) def close_stream(): @@ -258,7 +244,7 @@ def retry(e): try: # Download and write data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - except RESPONSE_READ_EXCEPTIONS as err: + except TransportError as err: retry(err) byte_counter += len(data_block) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index fe08839aaa..63156d3ac9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -17,16 +17,22 @@ import sys import time import types -import urllib.error import urllib.parse import urllib.request import xml.etree.ElementTree from ..compat import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media from ..downloader.hls import HlsFD +from ..networking.common import HEADRequest, Request +from ..networking.exceptions import network_exceptions from ..utils import ( IDENTITY, JSON_LD_RE, @@ -35,7 +41,6 @@ FormatSorter, GeoRestrictedError, GeoUtils, - HEADRequest, LenientJSONDecoder, Popen, RegexNotFoundError, @@ -61,7 +66,6 @@ js_to_json, mimetype2ext, netrc_from_content, - network_exceptions, orderedSet, parse_bitrate, parse_codecs, @@ -71,7 +75,6 @@ parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, @@ -83,8 +86,6 @@ unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, urlhandle_detect_ext, @@ -797,10 +798,12 @@ def __can_accept_status_code(err, expected_status): def _create_request(self, url_or_request, data=None, headers=None, query=None): if isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) + + url_or_request.update(data=data, headers=headers, query=query) + return url_or_request def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ @@ -838,12 +841,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa except network_exceptions as err: if isinstance(err, urllib.error.HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of <https://bugs.python.org/issue15002> - # introduced in Python 3.4.1. 
- err.fp._error = err - return err.fp + return err.response if errnote is False: return False diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index e69de29bb2..5e88764844 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -0,0 +1,13 @@ +# flake8: noqa: 401 +from .common import ( + HEADRequest, + PUTRequest, + Request, + RequestDirector, + RequestHandler, + Response, +) + +# isort: split +# TODO: all request handlers should be safely imported +from . import _urllib diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index 367f3f4447..a43c57bb4b 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -1,13 +1,22 @@ from __future__ import annotations import contextlib +import functools import ssl import sys +import typing import urllib.parse +import urllib.request +from .exceptions import RequestError, UnsupportedRequest from ..dependencies import certifi from ..socks import ProxyType -from ..utils import YoutubeDLError +from ..utils import format_field, traverse_obj + +if typing.TYPE_CHECKING: + from collections.abc import Iterable + + from ..utils.networking import HTTPHeaderDict def ssl_load_certs(context: ssl.SSLContext, use_certifi=True): @@ -23,11 +32,11 @@ def ssl_load_certs(context: ssl.SSLContext, use_certifi=True): # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): for storename in ('CA', 'ROOT'): - _ssl_load_windows_store_certs(context, storename) + ssl_load_windows_store_certs(context, storename) context.set_default_verify_paths() -def _ssl_load_windows_store_certs(ssl_context, storename): +def ssl_load_windows_store_certs(ssl_context, storename): # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py try: certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) @@ -44,10 +53,18 @@ def make_socks_proxy_opts(socks_proxy): url_components = urllib.parse.urlparse(socks_proxy) if url_components.scheme.lower() == 'socks5': socks_type = ProxyType.SOCKS5 - elif url_components.scheme.lower() in ('socks', 'socks4'): + rdns = False + elif url_components.scheme.lower() == 'socks5h': + socks_type = ProxyType.SOCKS5 + rdns = True + elif url_components.scheme.lower() == 'socks4': socks_type = ProxyType.SOCKS4 + rdns = False elif url_components.scheme.lower() == 'socks4a': socks_type = ProxyType.SOCKS4A + rdns = True + else: + raise ValueError(f'Unknown SOCKS proxy version: {url_components.scheme.lower()}') def unquote_if_non_empty(s): if not s: @@ -57,12 +74,25 @@ def unquote_if_non_empty(s): 'proxytype': socks_type, 'addr': url_components.hostname, 'port': url_components.port or 1080, - 'rdns': True, + 'rdns': rdns, 'username': unquote_if_non_empty(url_components.username), 'password': unquote_if_non_empty(url_components.password), } +def select_proxy(url, proxies): + """Unified proxy selector for all backends""" + url_components = urllib.parse.urlparse(url) + if 'no' in proxies: + hostport = url_components.hostname + format_field(url_components.port, None, ':%s') + if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}): + return + elif urllib.request.proxy_bypass(hostport): # check system settings + return + + return traverse_obj(proxies, url_components.scheme or 'http', 'all') + + def get_redirect_method(method, status): """Unified redirect method handling""" @@ -126,14 +156,53 @@ def 
make_ssl_context( client_certificate, keyfile=client_certificate_key, password=client_certificate_password) except ssl.SSLError: - raise YoutubeDLError('Unable to load client certificate') + raise RequestError('Unable to load client certificate') + if getattr(context, 'post_handshake_auth', None) is not None: + context.post_handshake_auth = True return context -def add_accept_encoding_header(headers, supported_encodings): - if supported_encodings and 'Accept-Encoding' not in headers: - headers['Accept-Encoding'] = ', '.join(supported_encodings) +class InstanceStoreMixin: + def __init__(self, **kwargs): + self.__instances = [] + super().__init__(**kwargs) # So that both MRO works - elif 'Accept-Encoding' not in headers: - headers['Accept-Encoding'] = 'identity' + @staticmethod + def _create_instance(**kwargs): + raise NotImplementedError + + def _get_instance(self, **kwargs): + for key, instance in self.__instances: + if key == kwargs: + return instance + + instance = self._create_instance(**kwargs) + self.__instances.append((kwargs, instance)) + return instance + + def _close_instance(self, instance): + if callable(getattr(instance, 'close', None)): + instance.close() + + def _clear_instances(self): + for _, instance in self.__instances: + self._close_instance(instance) + self.__instances.clear() + + +def add_accept_encoding_header(headers: HTTPHeaderDict, supported_encodings: Iterable[str]): + if 'Accept-Encoding' not in headers: + headers['Accept-Encoding'] = ', '.join(supported_encodings) or 'identity' + + +def wrap_request_errors(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + except UnsupportedRequest as e: + if e.handler is None: + e.handler = self + raise + return wrapper diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 1f5871ae67..2c5f09872a 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools import gzip import http.client @@ -9,26 +11,48 @@ import urllib.request import urllib.response import zlib +from urllib.request import ( + DataHandler, + FileHandler, + FTPHandler, + HTTPCookieProcessor, + HTTPDefaultErrorHandler, + HTTPErrorProcessor, + UnknownHandler, +) from ._helper import ( + InstanceStoreMixin, add_accept_encoding_header, get_redirect_method, make_socks_proxy_opts, + select_proxy, +) +from .common import Features, RequestHandler, Response, register +from .exceptions import ( + CertificateVerifyError, + HTTPError, + IncompleteRead, + ProxyError, + RequestError, + SSLError, + TransportError, ) from ..dependencies import brotli +from ..socks import ProxyError as SocksProxyError from ..socks import sockssocket from ..utils import escape_url, update_url_query -from ..utils.networking import clean_headers, std_headers SUPPORTED_ENCODINGS = ['gzip', 'deflate'] +CONTENT_DECODE_ERRORS = [zlib.error, OSError] if brotli: SUPPORTED_ENCODINGS.append('br') + CONTENT_DECODE_ERRORS.append(brotli.error) -def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): +def _create_http_connection(http_class, source_address, *args, **kwargs): hc = http_class(*args, **kwargs) - source_address = ydl_handler._params.get('source_address') if source_address is not None: # This is to workaround _create_connection() from socket where it will try all @@ -73,7 +97,7 @@ def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_a return hc -class 
HTTPHandler(urllib.request.HTTPHandler): +class HTTPHandler(urllib.request.AbstractHTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -88,21 +112,30 @@ class HTTPHandler(urllib.request.HTTPHandler): public domain. """ - def __init__(self, params, *args, **kwargs): - urllib.request.HTTPHandler.__init__(self, *args, **kwargs) - self._params = params + def __init__(self, context=None, source_address=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self._source_address = source_address + self._context = context - def http_open(self, req): - conn_class = http.client.HTTPConnection - - socks_proxy = req.headers.get('Ytdl-socks-proxy') + @staticmethod + def _make_conn_class(base, req): + conn_class = base + socks_proxy = req.headers.pop('Ytdl-socks-proxy', None) if socks_proxy: conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] + return conn_class + def http_open(self, req): + conn_class = self._make_conn_class(http.client.HTTPConnection, req) return self.do_open(functools.partial( - _create_http_connection, self, conn_class, False), - req) + _create_http_connection, conn_class, self._source_address), req) + + def https_open(self, req): + conn_class = self._make_conn_class(http.client.HTTPSConnection, req) + return self.do_open( + functools.partial( + _create_http_connection, conn_class, self._source_address), + req, context=self._context) @staticmethod def deflate(data): @@ -152,14 +185,6 @@ def http_request(self, req): if url != url_escaped: req = update_Request(req, url=url_escaped) - for h, v in self._params.get('http_headers', std_headers).items(): - # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 - # The dict keys are capitalized because of this bug by urllib - if h.capitalize() not in req.headers: - req.add_header(h, v) - - clean_headers(req.headers) - add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS) return super().do_request_(req) def http_response(self, req, resp): @@ -207,16 +232,12 @@ class SocksConnection(base_class): def connect(self): self.sock = sockssocket() self.sock.setproxy(**proxy_args) - if isinstance(self.timeout, (int, float)): + if type(self.timeout) in (int, float): # noqa: E721 self.sock.settimeout(self.timeout) self.sock.connect((self.host, self.port)) if isinstance(self, http.client.HTTPSConnection): - if hasattr(self, '_context'): # Python > 2.6 - self.sock = self._context.wrap_socket( - self.sock, server_hostname=self.host) - else: - self.sock = ssl.wrap_socket(self.sock) + self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host) return SocksConnection @@ -260,29 +281,25 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): unverifiable=True, method=new_method, data=new_data) -class ProxyHandler(urllib.request.ProxyHandler): +class ProxyHandler(urllib.request.BaseHandler): + handler_order = 100 + def __init__(self, proxies=None): + self.proxies = proxies # Set default handlers - for type in ('http', 'https'): - setattr(self, '%s_open' % type, - lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: - meth(r, proxy, type)) - urllib.request.ProxyHandler.__init__(self, proxies) + for type in ('http', 'https', 'ftp'): + setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r)) - def proxy_open(self, req, proxy, type): - req_proxy = req.headers.get('Ytdl-request-proxy') - if req_proxy is not None: - proxy = req_proxy - del 
req.headers['Ytdl-request-proxy'] - - if proxy == '__noproxy__': - return None # No Proxy - if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + def proxy_open(self, req): + proxy = select_proxy(req.get_full_url(), self.proxies) + if proxy is None: + return + if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'): req.add_header('Ytdl-socks-proxy', proxy) # yt-dlp's http/https handlers do wrapping the socket with socks return None return urllib.request.ProxyHandler.proxy_open( - self, req, proxy, type) + self, req, proxy, None) class PUTRequest(urllib.request.Request): @@ -313,3 +330,129 @@ def update_Request(req, url=None, data=None, headers=None, query=None): if hasattr(req, 'timeout'): new_req.timeout = req.timeout return new_req + + +class UrllibResponseAdapter(Response): + """ + HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse + """ + + def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl): + # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1] + # HTTPResponse: .getcode() was deprecated, .status always existed [2] + # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode + # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status + super().__init__( + fp=res, headers=res.headers, url=res.url, + status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None)) + + def read(self, amt=None): + try: + return self.fp.read(amt) + except Exception as e: + handle_response_read_exceptions(e) + raise e + + +def handle_sslerror(e: ssl.SSLError): + if not isinstance(e, ssl.SSLError): + return + if isinstance(e, ssl.SSLCertVerificationError): + raise CertificateVerifyError(cause=e) from e + raise SSLError(cause=e) from e + + +def handle_response_read_exceptions(e): + if isinstance(e, http.client.IncompleteRead): + raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e + elif isinstance(e, ssl.SSLError): + handle_sslerror(e) + elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)): + # OSErrors raised here should mostly be network related + raise TransportError(cause=e) from e + + +@register +class UrllibRH(RequestHandler, InstanceStoreMixin): + _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp') + _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h') + _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) + RH_NAME = 'urllib' + + def __init__(self, *, enable_file_urls: bool = False, **kwargs): + super().__init__(**kwargs) + self.enable_file_urls = enable_file_urls + if self.enable_file_urls: + self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file') + + def _create_instance(self, proxies, cookiejar): + opener = urllib.request.OpenerDirector() + handlers = [ + ProxyHandler(proxies), + HTTPHandler( + debuglevel=int(bool(self.verbose)), + context=self._make_sslcontext(), + source_address=self.source_address), + HTTPCookieProcessor(cookiejar), + DataHandler(), + UnknownHandler(), + HTTPDefaultErrorHandler(), + FTPHandler(), + HTTPErrorProcessor(), + RedirectHandler(), + ] + + if self.enable_file_urls: + handlers.append(FileHandler()) + + for handler in handlers: + opener.add_handler(handler) + + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See 
https://github.com/ytdl-org/youtube-dl/issues/1309 for details) + opener.addheaders = [] + return opener + + def _send(self, request): + headers = self._merge_headers(request.headers) + add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + urllib_req = urllib.request.Request( + url=request.url, + data=request.data, + headers=dict(headers), + method=request.method + ) + + opener = self._get_instance( + proxies=request.proxies or self.proxies, + cookiejar=request.extensions.get('cookiejar') or self.cookiejar + ) + try: + res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout)) + except urllib.error.HTTPError as e: + if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)): + # Prevent file object from being closed when urllib.error.HTTPError is destroyed. + e._closer.file = None + raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e + raise # unexpected + except urllib.error.URLError as e: + cause = e.reason # NOTE: cause may be a string + + # proxy errors + if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError): + raise ProxyError(cause=e) from e + + handle_response_read_exceptions(cause) + raise TransportError(cause=e) from e + except (http.client.InvalidURL, ValueError) as e: + # Validation errors + # http.client.HTTPConnection raises ValueError in some validation cases + # such as if request method contains illegal control characters [1] + # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256 + raise RequestError(cause=e) from e + except Exception as e: + handle_response_read_exceptions(e) + raise # unexpected + + return UrllibResponseAdapter(res) diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py new file mode 100644 index 0000000000..e4b3628276 --- /dev/null +++ b/yt_dlp/networking/common.py @@ -0,0 +1,522 @@ +from __future__ import annotations + +import abc +import copy +import enum +import functools +import io +import typing +import urllib.parse +import urllib.request +import urllib.response +from collections.abc import Iterable, Mapping +from email.message import Message +from http import HTTPStatus +from http.cookiejar import CookieJar + +from ._helper import make_ssl_context, wrap_request_errors +from .exceptions import ( + NoSupportingHandlers, + RequestError, + TransportError, + UnsupportedRequest, +) +from ..utils import ( + bug_reports_message, + classproperty, + error_to_str, + escape_url, + update_url_query, +) +from ..utils.networking import HTTPHeaderDict + +if typing.TYPE_CHECKING: + RequestData = bytes | Iterable[bytes] | typing.IO | None + + +class RequestDirector: + """RequestDirector class + + Helper class that, when given a request, forward it to a RequestHandler that supports it. + + @param logger: Logger instance. + @param verbose: Print debug request information to stdout. + """ + + def __init__(self, logger, verbose=False): + self.handlers: dict[str, RequestHandler] = {} + self.logger = logger # TODO(Grub4k): default logger + self.verbose = verbose + + def close(self): + for handler in self.handlers.values(): + handler.close() + + def add_handler(self, handler: RequestHandler): + """Add a handler. 
If a handler of the same RH_KEY exists, it will overwrite it"""
+        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
+        self.handlers[handler.RH_KEY] = handler
+
+    def _print_verbose(self, msg):
+        if self.verbose:
+            self.logger.stdout(f'director: {msg}')
+
+    def send(self, request: Request) -> Response:
+        """
+        Passes a request onto a suitable RequestHandler
+        """
+        if not self.handlers:
+            raise RequestError('No request handlers configured')
+
+        assert isinstance(request, Request)
+
+        unexpected_errors = []
+        unsupported_errors = []
+        # TODO (future): add a per-request preference system
+        for handler in reversed(list(self.handlers.values())):
+            self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.')
+            try:
+                handler.validate(request)
+            except UnsupportedRequest as e:
+                self._print_verbose(
+                    f'"{handler.RH_NAME}" cannot handle this request (reason: {error_to_str(e)})')
+                unsupported_errors.append(e)
+                continue
+
+            self._print_verbose(f'Sending request via "{handler.RH_NAME}"')
+            try:
+                response = handler.send(request)
+            except RequestError:
+                raise
+            except Exception as e:
+                self.logger.error(
+                    f'[{handler.RH_NAME}] Unexpected error: {error_to_str(e)}{bug_reports_message()}',
+                    is_error=False)
+                unexpected_errors.append(e)
+                continue
+
+            assert isinstance(response, Response)
+            return response
+
+        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
+
+
+_REQUEST_HANDLERS = {}
+
+
+def register(handler):
+    """Register a RequestHandler class"""
+    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
+    assert handler.RH_KEY not in _REQUEST_HANDLERS, f'RequestHandler {handler.RH_KEY} already registered'
+    _REQUEST_HANDLERS[handler.RH_KEY] = handler
+    return handler
+
+
+class Features(enum.Enum):
+    ALL_PROXY = enum.auto()
+    NO_PROXY = enum.auto()
+
+
+class RequestHandler(abc.ABC):
+
+    """Request Handler class
+
+    Request handlers are classes that, given a Request,
+    process the request from start to finish and return a Response.
+
+    Concrete subclasses need to redefine the _send(request) method,
+    which handles the underlying request logic and returns a Response.
+
+    RH_NAME class variable may contain a display name for the RequestHandler.
+    By default, this is generated from the class name.
+
+    The concrete request handler MUST have "RH" as the suffix in the class name.
+
+    All exceptions raised by a RequestHandler should be an instance of RequestError.
+    Any other exception raised will be treated as a handler issue.
+
+    If a Request is not supported by the handler, an UnsupportedRequest
+    should be raised with a reason.
+
+    By default, some checks are done on the request in _validate() based on the following class variables:
+    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
+        Any Request with a url scheme not in this list will raise an UnsupportedRequest.
+
+    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
+        a proxy url with a url scheme not in this list will raise an UnsupportedRequest.
+
+    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.
+    The above may be set to None to disable the checks.
+
+    Parameters:
+    @param logger: logger instance
+    @param headers: HTTP Headers to include when sending requests.
+    @param cookiejar: Cookiejar to use for requests.
+    @param timeout: Socket timeout to use when sending requests.
+    @param proxies: Proxies to use for sending requests.
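+        (For illustration only — these hypothetical values are not from the
+        original docstring: {'http': 'http://127.0.0.1:3128',
+        'all': 'socks5://127.0.0.1:1080', 'no': 'localhost,example.com'}.)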
+    @param source_address: Client-side IP address to bind to for requests.
+    @param verbose: Print debug request and traffic information to stdout.
+    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
+    @param client_cert: SSL client certificate configuration.
+            dict with {client_certificate, client_certificate_key, client_certificate_password}
+    @param verify: Verify SSL certificates
+    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.
+
+    Some configuration options may be available for individual Requests too. In this case,
+    either the Request configuration option takes precedence or they are merged.
+
+    Requests may have additional optional parameters defined as extensions.
+    RequestHandler subclasses may choose to support custom extensions.
+
+    The following extensions are defined for RequestHandler:
+    - `cookiejar`: Cookiejar to use for this request
+    - `timeout`: socket timeout to use for this request
+
+    Apart from the url protocol, proxies dict may contain the following keys:
+    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
+    - `no`: comma-separated list of hostnames (optionally with port) to not use a proxy for.
+    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.
+
+    """
+
+    _SUPPORTED_URL_SCHEMES = ()
+    _SUPPORTED_PROXY_SCHEMES = ()
+    _SUPPORTED_FEATURES = ()
+
+    def __init__(
+        self, *,
+        logger,  # TODO(Grub4k): default logger
+        headers: HTTPHeaderDict = None,
+        cookiejar: CookieJar = None,
+        timeout: float | int | None = None,
+        proxies: dict = None,
+        source_address: str = None,
+        verbose: bool = False,
+        prefer_system_certs: bool = False,
+        client_cert: dict[str, str | None] = None,
+        verify: bool = True,
+        legacy_ssl_support: bool = False,
+        **_,
+    ):
+
+        self._logger = logger
+        self.headers = headers or {}
+        self.cookiejar = cookiejar if cookiejar is not None else CookieJar()
+        self.timeout = float(timeout or 20)
+        self.proxies = proxies or {}
+        self.source_address = source_address
+        self.verbose = verbose
+        self.prefer_system_certs = prefer_system_certs
+        self._client_cert = client_cert or {}
+        self.verify = verify
+        self.legacy_ssl_support = legacy_ssl_support
+        super().__init__()
+
+    def _make_sslcontext(self):
+        return make_ssl_context(
+            verify=self.verify,
+            legacy_support=self.legacy_ssl_support,
+            use_certifi=not self.prefer_system_certs,
+            **self._client_cert,
+        )
+
+    def _merge_headers(self, request_headers):
+        return HTTPHeaderDict(self.headers, request_headers)
+
+    def _check_url_scheme(self, request: Request):
+        scheme = urllib.parse.urlparse(request.url).scheme.lower()
+        if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
+            raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
+        return scheme  # for further processing
+
+    def _check_proxies(self, proxies):
+        for proxy_key, proxy_url in proxies.items():
+            if proxy_url is None:
+                continue
+            if proxy_key == 'no':
+                if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
+                    raise UnsupportedRequest('"no" proxy is not supported')
+                continue
+            if (
+                proxy_key == 'all'
+                and self._SUPPORTED_FEATURES is not None
+                and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
+            ):
+                raise UnsupportedRequest('"all" proxy is not supported')
+
+            # Unlikely this handler will use this proxy, so ignore.
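+            # (Illustrative, hypothetical example rather than anything from the
+            # original comment: with proxies = {'rtmp': 'http://127.0.0.1:8080'},
+            # a handler that does not list 'rtmp' in _SUPPORTED_URL_SCHEMES
+            # simply skips validating that entry.)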
+ # This is to allow a case where a proxy may be set for a protocol + # for one handler in which such protocol (and proxy) is not supported by another handler. + if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'): + continue + + if self._SUPPORTED_PROXY_SCHEMES is None: + # Skip proxy scheme checks + continue + + # Scheme-less proxies are not supported + if urllib.request._parse_proxy(proxy_url)[0] is None: + raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme') + + scheme = urllib.parse.urlparse(proxy_url).scheme.lower() + if scheme not in self._SUPPORTED_PROXY_SCHEMES: + raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"') + + def _check_cookiejar_extension(self, extensions): + if not extensions.get('cookiejar'): + return + if not isinstance(extensions['cookiejar'], CookieJar): + raise UnsupportedRequest('cookiejar is not a CookieJar') + + def _check_timeout_extension(self, extensions): + if extensions.get('timeout') is None: + return + if not isinstance(extensions['timeout'], (float, int)): + raise UnsupportedRequest('timeout is not a float or int') + + def _check_extensions(self, extensions): + self._check_cookiejar_extension(extensions) + self._check_timeout_extension(extensions) + + def _validate(self, request): + self._check_url_scheme(request) + self._check_proxies(request.proxies or self.proxies) + self._check_extensions(request.extensions) + + @wrap_request_errors + def validate(self, request: Request): + if not isinstance(request, Request): + raise TypeError('Expected an instance of Request') + self._validate(request) + + @wrap_request_errors + def send(self, request: Request) -> Response: + if not isinstance(request, Request): + raise TypeError('Expected an instance of Request') + return self._send(request) + + @abc.abstractmethod + def _send(self, request: Request): + """Handle a request from start to finish. Redefine in subclasses.""" + + def close(self): + pass + + @classproperty + def RH_NAME(cls): + return cls.__name__[:-2] + + @classproperty + def RH_KEY(cls): + assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"' + return cls.__name__[:-2] + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + +class Request: + """ + Represents a request to be made. + Partially backwards-compatible with urllib.request.Request. + + @param url: url to send. Will be sanitized. + @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None + @param headers: headers to send. + @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects. + @param query: URL query parameters to update the url with. + @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET + @param extensions: Dictionary of Request extensions to add, as supported by handlers. 
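+
+    A minimal usage sketch (the URL, header and extension values here are
+    hypothetical, not taken from the original docstring):
+
+        req = Request('https://example.com/api', query={'page': '2'},
+                      headers={'X-Test': '1'}, extensions={'timeout': 5})
+        assert req.method == 'GET'   # no payload data, so GET is implied
+        req.data = b'payload'        # assigning data switches the implied method
+        assert req.method == 'POST'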
+ """ + + def __init__( + self, + url: str, + data: RequestData = None, + headers: typing.Mapping = None, + proxies: dict = None, + query: dict = None, + method: str = None, + extensions: dict = None + ): + + self._headers = HTTPHeaderDict() + self._data = None + + if query: + url = update_url_query(url, query) + + self.url = url + self.method = method + if headers: + self.headers = headers + self.data = data # note: must be done after setting headers + self.proxies = proxies or {} + self.extensions = extensions or {} + + @property + def url(self): + return self._url + + @url.setter + def url(self, url): + if not isinstance(url, str): + raise TypeError('url must be a string') + elif url.startswith('//'): + url = 'http:' + url + self._url = escape_url(url) + + @property + def method(self): + return self._method or ('POST' if self.data is not None else 'GET') + + @method.setter + def method(self, method): + if method is None: + self._method = None + elif isinstance(method, str): + self._method = method.upper() + else: + raise TypeError('method must be a string') + + @property + def data(self): + return self._data + + @data.setter + def data(self, data: RequestData): + # Try catch some common mistakes + if data is not None and ( + not isinstance(data, (bytes, io.IOBase, Iterable)) or isinstance(data, (str, Mapping)) + ): + raise TypeError('data must be bytes, iterable of bytes, or a file-like object') + + if data == self._data and self._data is None: + self.headers.pop('Content-Length', None) + + # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data + if data != self._data: + if self._data is not None: + self.headers.pop('Content-Length', None) + self._data = data + + if self._data is None: + self.headers.pop('Content-Type', None) + + if 'Content-Type' not in self.headers and self._data is not None: + self.headers['Content-Type'] = 'application/x-www-form-urlencoded' + + @property + def headers(self) -> HTTPHeaderDict: + return self._headers + + @headers.setter + def headers(self, new_headers: Mapping): + """Replaces headers of the request. If not a CaseInsensitiveDict, it will be converted to one.""" + if isinstance(new_headers, HTTPHeaderDict): + self._headers = new_headers + elif isinstance(new_headers, Mapping): + self._headers = HTTPHeaderDict(new_headers) + else: + raise TypeError('headers must be a mapping') + + def update(self, url=None, data=None, headers=None, query=None): + self.data = data or self.data + self.headers.update(headers or {}) + self.url = update_url_query(url or self.url, query or {}) + + def copy(self): + return self.__class__( + url=self.url, + headers=copy.deepcopy(self.headers), + proxies=copy.deepcopy(self.proxies), + data=self._data, + extensions=copy.copy(self.extensions), + method=self._method, + ) + + +HEADRequest = functools.partial(Request, method='HEAD') +PUTRequest = functools.partial(Request, method='PUT') + + +class Response(io.IOBase): + """ + Base class for HTTP response adapters. + + By default, it provides a basic wrapper for a file-like response object. + + Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse. + + @param fp: Original, file-like, response. + @param url: URL that this is a response of. + @param headers: response headers. + @param status: Response HTTP status code. Default is 200 OK. + @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided. 
+ """ + + def __init__( + self, + fp: typing.IO, + url: str, + headers: Mapping[str, str], + status: int = 200, + reason: str = None): + + self.fp = fp + self.headers = Message() + for name, value in headers.items(): + self.headers.add_header(name, value) + self.status = status + self.url = url + try: + self.reason = reason or HTTPStatus(status).phrase + except ValueError: + self.reason = None + + def readable(self): + return self.fp.readable() + + def read(self, amt: int = None) -> bytes: + # Expected errors raised here should be of type RequestError or subclasses. + # Subclasses should redefine this method with more precise error handling. + try: + return self.fp.read(amt) + except Exception as e: + raise TransportError(cause=e) from e + + def close(self): + self.fp.close() + return super().close() + + def get_header(self, name, default=None): + """Get header for name. + If there are multiple matching headers, return all seperated by comma.""" + headers = self.headers.get_all(name) + if not headers: + return default + if name.title() == 'Set-Cookie': + # Special case, only get the first one + # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1 + return headers[0] + return ', '.join(headers) + + # The following methods are for compatability reasons and are deprecated + @property + def code(self): + return self.status + + def getcode(self): + return self.status + + def geturl(self): + return self.url + + def info(self): + return self.headers + + def getheader(self, name, default=None): + return self.get_header(name, default) diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 89b484a220..6fe8afb925 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -1,9 +1,197 @@ -import http.client -import socket -import ssl +from __future__ import annotations + +import typing import urllib.error -network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] -if hasattr(ssl, 'CertificateError'): - network_exceptions.append(ssl.CertificateError) -network_exceptions = tuple(network_exceptions) +from ..utils import YoutubeDLError + +if typing.TYPE_CHECKING: + from .common import RequestHandler, Response + + +class RequestError(YoutubeDLError): + def __init__( + self, + msg: str | None = None, + cause: Exception | str | None = None, + handler: RequestHandler = None + ): + self.handler = handler + self.cause = cause + if not msg and cause: + msg = str(cause) + super().__init__(msg) + + +class UnsupportedRequest(RequestError): + """raised when a handler cannot handle a request""" + pass + + +class NoSupportingHandlers(RequestError): + """raised when no handlers can support a request for various reasons""" + + def __init__(self, unsupported_errors: list[UnsupportedRequest], unexpected_errors: list[Exception]): + self.unsupported_errors = unsupported_errors or [] + self.unexpected_errors = unexpected_errors or [] + + # Print a quick summary of the errors + err_handler_map = {} + for err in unsupported_errors: + err_handler_map.setdefault(err.msg, []).append(err.handler.RH_NAME) + + reason_str = ', '.join([f'{msg} ({", ".join(handlers)})' for msg, handlers in err_handler_map.items()]) + if unexpected_errors: + reason_str = ' + '.join(filter(None, [reason_str, f'{len(unexpected_errors)} unexpected error(s)'])) + + err_str = 'Unable to handle request' + if reason_str: + err_str += f': {reason_str}' + + super().__init__(msg=err_str) + + +class TransportError(RequestError): + """Network related errors""" + + +class 
HTTPError(RequestError): + def __init__(self, response: Response, redirect_loop=False): + self.response = response + self.status = response.status + self.reason = response.reason + self.redirect_loop = redirect_loop + msg = f'HTTP Error {response.status}: {response.reason}' + if redirect_loop: + msg += ' (redirect loop detected)' + + super().__init__(msg=msg) + + def close(self): + self.response.close() + + def __repr__(self): + return f'<HTTPError {self.status}: {self.reason}>' + + +class IncompleteRead(TransportError): + def __init__(self, partial, expected=None, **kwargs): + self.partial = partial + self.expected = expected + msg = f'{len(partial)} bytes read' + if expected is not None: + msg += f', {expected} more expected' + + super().__init__(msg=msg, **kwargs) + + def __repr__(self): + return f'<IncompleteRead: {self.msg}>' + + +class SSLError(TransportError): + pass + + +class CertificateVerifyError(SSLError): + """Raised when certificate validated has failed""" + pass + + +class ProxyError(TransportError): + pass + + +class _CompatHTTPError(urllib.error.HTTPError, HTTPError): + """ + Provides backwards compatibility with urllib.error.HTTPError. + Do not use this class directly, use HTTPError instead. + """ + + def __init__(self, http_error: HTTPError): + super().__init__( + url=http_error.response.url, + code=http_error.status, + msg=http_error.msg, + hdrs=http_error.response.headers, + fp=http_error.response + ) + self._closer.file = None # Disable auto close + self._http_error = http_error + HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop) + + @property + def status(self): + return self._http_error.status + + @status.setter + def status(self, value): + return + + @property + def reason(self): + return self._http_error.reason + + @reason.setter + def reason(self, value): + return + + @property + def headers(self): + return self._http_error.response.headers + + @headers.setter + def headers(self, value): + return + + def info(self): + return self.response.headers + + def getcode(self): + return self.status + + def geturl(self): + return self.response.url + + @property + def code(self): + return self.status + + @code.setter + def code(self, value): + return + + @property + def url(self): + return self.response.url + + @url.setter + def url(self, value): + return + + @property + def hdrs(self): + return self.response.headers + + @hdrs.setter + def hdrs(self, value): + return + + @property + def filename(self): + return self.response.url + + @filename.setter + def filename(self, value): + return + + def __getattr__(self, name): + return super().__getattr__(name) + + def __str__(self): + return str(self._http_error) + + def __repr__(self): + return repr(self._http_error) + + +network_exceptions = (HTTPError, TransportError) diff --git a/yt_dlp/utils/_deprecated.py b/yt_dlp/utils/_deprecated.py index ca0fb1614d..e55d42354a 100644 --- a/yt_dlp/utils/_deprecated.py +++ b/yt_dlp/utils/_deprecated.py @@ -10,16 +10,16 @@ from ._utils import preferredencoding +from ..networking._urllib import HTTPHandler # isort: split +from .networking import random_user_agent, std_headers # noqa: F401 from ..networking._urllib import PUTRequest # noqa: F401 from ..networking._urllib import SUPPORTED_ENCODINGS, HEADRequest # noqa: F401 -from ..networking._urllib import HTTPHandler as YoutubeDLHandler # noqa: F401 from ..networking._urllib import ProxyHandler as PerRequestProxyHandler # noqa: F401 from ..networking._urllib import RedirectHandler as 
YoutubeDLRedirectHandler # noqa: F401 from ..networking._urllib import make_socks_conn_class, update_Request # noqa: F401 from ..networking.exceptions import network_exceptions # noqa: F401 -from .networking import random_user_agent, std_headers # noqa: F401 def encodeFilename(s, for_subprocess=False): @@ -47,3 +47,12 @@ def decodeOption(optval): def error_to_compat_str(err): return str(err) + + +class YoutubeDLHandler(HTTPHandler): + def __init__(self, params, *args, **kwargs): + self._params = params + super().__init__(*args, **kwargs) + + +YoutubeDLHTTPSHandler = YoutubeDLHandler diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d5704cadca..d0e3287166 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -15,8 +15,6 @@ import hmac import html.entities import html.parser -import http.client -import http.cookiejar import inspect import io import itertools @@ -897,6 +895,7 @@ def formatSeconds(secs, delim=':', msec=False): def make_HTTPS_handler(params, **kwargs): + from ._deprecated import YoutubeDLHTTPSHandler from ..networking._helper import make_ssl_context return YoutubeDLHTTPSHandler(params, context=make_ssl_context( verify=not params.get('nocheckcertificate'), @@ -1140,38 +1139,6 @@ class XAttrUnavailableError(YoutubeDLError): pass -class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler): - def __init__(self, params, https_conn_class=None, *args, **kwargs): - urllib.request.HTTPSHandler.__init__(self, *args, **kwargs) - self._https_conn_class = https_conn_class or http.client.HTTPSConnection - self._params = params - - def https_open(self, req): - kwargs = {} - conn_class = self._https_conn_class - - if hasattr(self, '_context'): # python > 2.6 - kwargs['context'] = self._context - if hasattr(self, '_check_hostname'): # python 3.x - kwargs['check_hostname'] = self._check_hostname - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - from ..networking._urllib import make_socks_conn_class - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - from ..networking._urllib import _create_http_connection - try: - return self.do_open( - functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs) - except urllib.error.URLError as e: - if (isinstance(e.reason, ssl.SSLError) - and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'): - raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect') - raise - - def is_path_like(f): return isinstance(f, (str, bytes, os.PathLike)) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index 95b54fabef..ac355ddc85 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -1,4 +1,9 @@ +import collections import random +import urllib.parse +import urllib.request + +from ._utils import remove_start def random_user_agent(): @@ -46,15 +51,67 @@ def random_user_agent(): return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) -std_headers = { +class HTTPHeaderDict(collections.UserDict, dict): + """ + Store and access keys case-insensitively. + The constructor can take multiple dicts, in which keys in the latter are prioritised. 
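+
+    For illustration (hypothetical values, not part of the original docstring):
+
+        headers = HTTPHeaderDict({'user-agent': 'a'}, {'User-Agent': 'b'}, x_test=1)
+        assert headers['USER-AGENT'] == 'b'  # later mappings take priority
+        assert headers['x_test'] == '1'      # keys are title-cased, values coerced to str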
+ """ + + def __init__(self, *args, **kwargs): + super().__init__() + for dct in args: + if dct is not None: + self.update(dct) + self.update(kwargs) + + def __setitem__(self, key, value): + super().__setitem__(key.title(), str(value)) + + def __getitem__(self, key): + return super().__getitem__(key.title()) + + def __delitem__(self, key): + super().__delitem__(key.title()) + + def __contains__(self, key): + return super().__contains__(key.title() if isinstance(key, str) else key) + + +std_headers = HTTPHeaderDict({ 'User-Agent': random_user_agent(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Sec-Fetch-Mode': 'navigate', -} +}) -def clean_headers(headers): - if 'Youtubedl-no-compression' in headers: # compat - del headers['Youtubedl-no-compression'] +def clean_proxies(proxies: dict, headers: HTTPHeaderDict): + req_proxy = headers.pop('Ytdl-Request-Proxy', None) + if req_proxy: + proxies.clear() # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY + proxies['all'] = req_proxy + for proxy_key, proxy_url in proxies.items(): + if proxy_url == '__noproxy__': + proxies[proxy_key] = None + continue + if proxy_key == 'no': # special case + continue + if proxy_url is not None: + # Ensure proxies without a scheme are http. + proxy_scheme = urllib.request._parse_proxy(proxy_url)[0] + if proxy_scheme is None: + proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//') + + replace_scheme = { + 'socks5': 'socks5h', # compat: socks5 was treated as socks5h + 'socks': 'socks4' # compat: non-standard + } + if proxy_scheme in replace_scheme: + proxies[proxy_key] = urllib.parse.urlunparse( + urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme])) + + +def clean_headers(headers: HTTPHeaderDict): + if 'Youtubedl-No-Compression' in headers: # compat + del headers['Youtubedl-No-Compression'] headers['Accept-Encoding'] = 'identity' From 3d2623a898196640f7cc0fc8b70118ff19e6925d Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 9 Jul 2023 13:23:02 +0530 Subject: [PATCH 276/501] [compat, networking] Deprecate old functions (#2861) Authored by: coletdjnz, pukkandan --- test/test_download.py | 2 +- test/test_networking.py | 27 ++++++----- test/test_networking_utils.py | 64 ++++++++++++++++++++----- yt_dlp/YoutubeDL.py | 12 +++-- yt_dlp/__init__.py | 2 +- yt_dlp/compat/_deprecated.py | 1 - yt_dlp/compat/_legacy.py | 1 + yt_dlp/downloader/external.py | 7 ++- yt_dlp/downloader/f4m.py | 8 ++-- yt_dlp/downloader/fragment.py | 19 +++----- yt_dlp/downloader/hls.py | 2 +- yt_dlp/downloader/http.py | 41 ++++++++-------- yt_dlp/downloader/ism.py | 4 +- yt_dlp/downloader/niconico.py | 11 ++--- yt_dlp/downloader/youtube_live_chat.py | 10 ++-- yt_dlp/extractor/abematv.py | 3 +- yt_dlp/extractor/adn.py | 16 +++---- yt_dlp/extractor/adobepass.py | 20 ++++---- yt_dlp/extractor/ant1newsgr.py | 4 +- yt_dlp/extractor/archiveorg.py | 12 ++--- yt_dlp/extractor/atresplayer.py | 6 +-- yt_dlp/extractor/bbc.py | 14 +++--- yt_dlp/extractor/bilibili.py | 4 +- yt_dlp/extractor/bitchute.py | 2 +- yt_dlp/extractor/bravotv.py | 4 +- yt_dlp/extractor/brightcove.py | 6 +-- yt_dlp/extractor/canalplus.py | 2 +- yt_dlp/extractor/cbsnews.py | 2 +- yt_dlp/extractor/ceskatelevize.py | 30 ++++++------ yt_dlp/extractor/cinetecamilano.py | 4 +- yt_dlp/extractor/ciscowebex.py | 6 +-- yt_dlp/extractor/common.py | 41 +++++++++------- yt_dlp/extractor/crackle.py | 4 +- yt_dlp/extractor/crunchyroll.py | 
4 +- yt_dlp/extractor/cultureunplugged.py | 6 +-- yt_dlp/extractor/dacast.py | 4 +- yt_dlp/extractor/dailymotion.py | 6 +-- yt_dlp/extractor/discovery.py | 6 +-- yt_dlp/extractor/dplay.py | 8 ++-- yt_dlp/extractor/eagleplatform.py | 6 +-- yt_dlp/extractor/eitb.py | 10 ++-- yt_dlp/extractor/eporner.py | 2 +- yt_dlp/extractor/facebook.py | 14 +++--- yt_dlp/extractor/fc2.py | 6 +-- yt_dlp/extractor/filmon.py | 14 +++--- yt_dlp/extractor/fox.py | 10 ++-- yt_dlp/extractor/foxsports.py | 5 +- yt_dlp/extractor/fujitv.py | 2 +- yt_dlp/extractor/funimation.py | 6 +-- yt_dlp/extractor/gdcvault.py | 15 ++---- yt_dlp/extractor/generic.py | 8 ++-- yt_dlp/extractor/globo.py | 2 +- yt_dlp/extractor/googledrive.py | 2 +- yt_dlp/extractor/hketv.py | 2 +- yt_dlp/extractor/hotnewhiphop.py | 14 ++---- yt_dlp/extractor/hotstar.py | 5 +- yt_dlp/extractor/hrti.py | 10 ++-- yt_dlp/extractor/ign.py | 17 +++---- yt_dlp/extractor/imggaming.py | 6 +-- yt_dlp/extractor/instagram.py | 6 +-- yt_dlp/extractor/iprima.py | 4 +- yt_dlp/extractor/kakao.py | 6 +-- yt_dlp/extractor/kick.py | 3 +- yt_dlp/extractor/kuwo.py | 2 +- yt_dlp/extractor/la7.py | 9 +--- yt_dlp/extractor/lbry.py | 4 +- yt_dlp/extractor/lecturio.py | 2 +- yt_dlp/extractor/lego.py | 4 +- yt_dlp/extractor/limelight.py | 6 +-- yt_dlp/extractor/linuxacademy.py | 15 +++--- yt_dlp/extractor/mediasite.py | 2 +- yt_dlp/extractor/megatvcom.py | 6 +-- yt_dlp/extractor/mgtv.py | 6 +-- yt_dlp/extractor/minds.py | 2 +- yt_dlp/extractor/miomio.py | 10 ++-- yt_dlp/extractor/mtv.py | 11 ++--- yt_dlp/extractor/nbc.py | 2 +- yt_dlp/extractor/nebula.py | 4 +- yt_dlp/extractor/neteasemusic.py | 6 +-- yt_dlp/extractor/niconico.py | 16 +++---- yt_dlp/extractor/njpwworld.py | 2 +- yt_dlp/extractor/nosvideo.py | 6 +-- yt_dlp/extractor/nowness.py | 8 ++-- yt_dlp/extractor/nrk.py | 5 +- yt_dlp/extractor/odkmedia.py | 6 +-- yt_dlp/extractor/odnoklassniki.py | 4 +- yt_dlp/extractor/orf.py | 2 +- yt_dlp/extractor/owncloud.py | 2 +- yt_dlp/extractor/packtpub.py | 11 ++--- yt_dlp/extractor/patreon.py | 6 +-- yt_dlp/extractor/peloton.py | 12 ++--- yt_dlp/extractor/piapro.py | 2 +- yt_dlp/extractor/pladform.py | 2 +- yt_dlp/extractor/platzi.py | 2 +- yt_dlp/extractor/playplustv.py | 14 ++---- yt_dlp/extractor/pornhub.py | 11 +++-- yt_dlp/extractor/puhutv.py | 8 ++-- yt_dlp/extractor/radiko.py | 2 +- yt_dlp/extractor/radiocanada.py | 6 +-- yt_dlp/extractor/rcs.py | 2 +- yt_dlp/extractor/rcti.py | 4 +- yt_dlp/extractor/recurbate.py | 5 +- yt_dlp/extractor/redbulltv.py | 6 +-- yt_dlp/extractor/redgifs.py | 4 +- yt_dlp/extractor/regiotv.py | 10 ++-- yt_dlp/extractor/rokfin.py | 4 +- yt_dlp/extractor/roosterteeth.py | 10 ++-- yt_dlp/extractor/rozhlas.py | 4 +- yt_dlp/extractor/rte.py | 6 +-- yt_dlp/extractor/rts.py | 4 +- yt_dlp/extractor/rumble.py | 4 +- yt_dlp/extractor/safari.py | 6 +-- yt_dlp/extractor/sbs.py | 2 +- yt_dlp/extractor/sevenplus.py | 10 ++-- yt_dlp/extractor/shahid.py | 8 ++-- yt_dlp/extractor/sina.py | 12 ++--- yt_dlp/extractor/sixplay.py | 2 +- yt_dlp/extractor/slideslive.py | 2 +- yt_dlp/extractor/sonyliv.py | 10 ++-- yt_dlp/extractor/soundcloud.py | 17 +++---- yt_dlp/extractor/teachable.py | 2 +- yt_dlp/extractor/telemundo.py | 9 ++-- yt_dlp/extractor/tennistv.py | 2 +- yt_dlp/extractor/tenplay.py | 9 ++-- yt_dlp/extractor/tfo.py | 8 +--- yt_dlp/extractor/theplatform.py | 4 +- yt_dlp/extractor/thisoldhouse.py | 4 +- yt_dlp/extractor/threeqsdn.py | 4 +- yt_dlp/extractor/tiktok.py | 4 +- yt_dlp/extractor/toutv.py | 6 +-- yt_dlp/extractor/triller.py | 4 +- 
yt_dlp/extractor/trueid.py | 6 +-- yt_dlp/extractor/tubetugraz.py | 8 ++-- yt_dlp/extractor/tubitv.py | 8 ++-- yt_dlp/extractor/tumblr.py | 2 +- yt_dlp/extractor/tunein.py | 4 +- yt_dlp/extractor/tv2.py | 10 ++-- yt_dlp/extractor/tvp.py | 4 +- yt_dlp/extractor/tvplay.py | 10 ++-- yt_dlp/extractor/tvplayer.py | 10 ++-- yt_dlp/extractor/twitcasting.py | 4 +- yt_dlp/extractor/twitch.py | 2 +- yt_dlp/extractor/twitter.py | 2 +- yt_dlp/extractor/udemy.py | 15 +++--- yt_dlp/extractor/vevo.py | 10 ++-- yt_dlp/extractor/vice.py | 10 ++-- yt_dlp/extractor/videocampus_sachsen.py | 4 +- yt_dlp/extractor/vidio.py | 2 +- yt_dlp/extractor/vidlii.py | 2 +- yt_dlp/extractor/viewlift.py | 6 +-- yt_dlp/extractor/viidea.py | 6 +-- yt_dlp/extractor/vimeo.py | 31 +++++------- yt_dlp/extractor/vk.py | 2 +- yt_dlp/extractor/vocaroo.py | 6 +-- yt_dlp/extractor/vodlocker.py | 12 ++--- yt_dlp/extractor/voot.py | 4 +- yt_dlp/extractor/vrt.py | 4 +- yt_dlp/extractor/vrv.py | 7 +-- yt_dlp/extractor/weibo.py | 2 +- yt_dlp/extractor/weverse.py | 8 ++-- yt_dlp/extractor/wistia.py | 6 +-- yt_dlp/extractor/wykop.py | 4 +- yt_dlp/extractor/xhamster.py | 2 +- yt_dlp/extractor/xtube.py | 4 +- yt_dlp/extractor/yesjapan.py | 9 ++-- yt_dlp/extractor/youtube.py | 15 +++--- yt_dlp/extractor/zaiko.py | 2 +- yt_dlp/extractor/zattoo.py | 5 +- yt_dlp/extractor/zype.py | 6 +-- yt_dlp/networking/common.py | 6 +++ yt_dlp/networking/exceptions.py | 22 ++++++++- yt_dlp/postprocessor/common.py | 11 ++--- yt_dlp/update.py | 9 ++-- yt_dlp/utils/_deprecated.py | 19 -------- yt_dlp/utils/_legacy.py | 62 +++++++++++++++++++++++- yt_dlp/utils/_utils.py | 47 +----------------- 176 files changed, 707 insertions(+), 729 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index fd7752cddf..6f00a4deda 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -160,7 +160,7 @@ def try_rm_tcs_files(tcs=None): force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].code == 503): + if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].status == 503): err.msg = f'{getattr(err, "msg", err)} ({tname})' raise diff --git a/test/test_networking.py b/test/test_networking.py index 147a4ff491..b60ed283be 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -1057,14 +1057,15 @@ def test_compat_request(self): urllib_req = urllib.request.Request('http://foo.bar', data=b'test', method='PUT', headers={'X-Test': '1'}) urllib_req.add_unredirected_header('Cookie', 'bob=bob') urllib_req.timeout = 2 - - req = ydl.urlopen(urllib_req).request - assert req.url == urllib_req.get_full_url() - assert req.data == urllib_req.data - assert req.method == urllib_req.get_method() - assert 'X-Test' in req.headers - assert 'Cookie' in req.headers - assert req.extensions.get('timeout') == 2 + with warnings.catch_warnings(): + warnings.simplefilter('ignore', category=DeprecationWarning) + req = ydl.urlopen(urllib_req).request + assert req.url == urllib_req.get_full_url() + assert req.data == urllib_req.data + assert req.method == urllib_req.get_method() + assert 'X-Test' in req.headers + assert 'Cookie' in req.headers + assert req.extensions.get('timeout') == 2 with pytest.raises(AssertionError): 
ydl.urlopen(None) @@ -1362,7 +1363,9 @@ def test_get_header(self): def test_compat(self): res = Response(io.BytesIO(b''), url='test://', status=404, headers={'test': 'test'}) - assert res.code == res.getcode() == res.status - assert res.geturl() == res.url - assert res.info() is res.headers - assert res.getheader('test') == res.get_header('test') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', category=DeprecationWarning) + assert res.code == res.getcode() == res.status + assert res.geturl() == res.url + assert res.info() is res.headers + assert res.getheader('test') == res.get_header('test') diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index f9f876af3d..ef46f79ed0 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -8,11 +8,13 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import contextlib import io import platform import random import ssl import urllib.error +import warnings from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import certifi @@ -202,20 +204,58 @@ def test_compat_http_error(self): assert isinstance(error, HTTPError) assert isinstance(error, urllib.error.HTTPError) - assert error.code == 403 - assert error.getcode() == 403 - assert error.hdrs is error.response.headers - assert error.info() is error.response.headers - assert error.headers is error.response.headers - assert error.filename == error.response.url - assert error.url == error.response.url - assert error.geturl() == error.response.url + @contextlib.contextmanager + def raises_deprecation_warning(): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + yield + + if len(w) == 0: + pytest.fail('Did not raise DeprecationWarning') + if len(w) > 1: + pytest.fail(f'Raised multiple warnings: {w}') + + if not issubclass(w[-1].category, DeprecationWarning): + pytest.fail(f'Expected DeprecationWarning, got {w[-1].category}') + w.clear() + + with raises_deprecation_warning(): + assert error.code == 403 + + with raises_deprecation_warning(): + assert error.getcode() == 403 + + with raises_deprecation_warning(): + assert error.hdrs is error.response.headers + + with raises_deprecation_warning(): + assert error.info() is error.response.headers + + with raises_deprecation_warning(): + assert error.headers is error.response.headers + + with raises_deprecation_warning(): + assert error.filename == error.response.url + + with raises_deprecation_warning(): + assert error.url == error.response.url + + with raises_deprecation_warning(): + assert error.geturl() == error.response.url # Passthrough file operations - assert error.read() == b'test' - assert not error.closed - # Technically Response operations are also passed through, which should not be used. - assert error.get_header('test') == 'test' + with raises_deprecation_warning(): + assert error.read() == b'test' + + with raises_deprecation_warning(): + assert not error.closed + + with raises_deprecation_warning(): + # Technically Response operations are also passed through, which should not be used. 
+ assert error.get_header('test') == 'test' + + # Should not raise a warning + error.close() @pytest.mark.skipif( platform.python_implementation() == 'PyPy', reason='garbage collector works differently in pypy') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 29a18aef02..850eb8ae0a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -33,7 +33,7 @@ from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import format_text -from .networking import Request, RequestDirector +from .networking import HEADRequest, Request, RequestDirector from .networking.common import _REQUEST_HANDLERS from .networking.exceptions import ( HTTPError, @@ -41,6 +41,7 @@ RequestError, SSLError, _CompatHTTPError, + network_exceptions, ) from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_pps @@ -80,7 +81,6 @@ ExtractorError, FormatSorter, GeoRestrictedError, - HEADRequest, ISO3166Utils, LazyList, MaxDownloadsReached, @@ -122,7 +122,6 @@ locked_file, make_archive_id, make_dir, - network_exceptions, number_of_digits, orderedSet, orderedSet_from_options, @@ -135,7 +134,6 @@ sanitize_filename, sanitize_path, sanitize_url, - std_headers, str_or_none, strftime_or_none, subtitles_filename, @@ -158,6 +156,7 @@ HTTPHeaderDict, clean_headers, clean_proxies, + std_headers, ) from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__ @@ -4019,6 +4018,9 @@ def urlopen(self, req): if isinstance(req, str): req = Request(req) elif isinstance(req, urllib.request.Request): + self.deprecation_warning( + 'Passing a urllib.request.Request object to YoutubeDL.urlopen() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') req = urllib_req_to_req(req) assert isinstance(req, Request) @@ -4242,7 +4244,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError) and err.code == 404: + if isinstance(err, HTTPError) and err.status == 404: self.to_screen(f'[info] {thumb_display_id.title()} does not exist') else: self.report_warning(f'Unable to download {thumb_display_id}: {err}') diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index b81277a572..991dbcda7e 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -57,11 +57,11 @@ read_stdin, render_table, setproctitle, - std_headers, traverse_obj, variadic, write_string, ) +from .utils.networking import std_headers from .YoutubeDL import YoutubeDL _IN_CLI = False diff --git a/yt_dlp/compat/_deprecated.py b/yt_dlp/compat/_deprecated.py index 342f1f80d6..14d37b2367 100644 --- a/yt_dlp/compat/_deprecated.py +++ b/yt_dlp/compat/_deprecated.py @@ -8,7 +8,6 @@ compat_b64decode = base64.b64decode -compat_HTTPError = urllib.error.HTTPError compat_urlparse = urllib.parse compat_parse_qs = urllib.parse.parse_qs compat_urllib_parse_unquote = urllib.parse.unquote diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index 83bf869a80..912907a021 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -70,6 +70,7 @@ def compat_setenv(key, value, env=os.environ): compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser compat_http_client = http.client compat_http_server = http.server +compat_HTTPError = urllib.error.HTTPError compat_input = input compat_integer_types = (int, ) compat_itertools_count = itertools.count 
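
The per-file hunks that follow repeat one mechanical migration to the new networking layer: urllib.error.HTTPError / compat_HTTPError handling becomes yt_dlp.networking.exceptions.HTTPError (reading .status instead of .code, and the response body via err.response.read()), sanitized_Request becomes yt_dlp.networking.Request (headers assigned on the mapping instead of via add_header()), and response handles expose .url in place of .geturl(). A minimal sketch of the new calling convention, assuming only the names introduced in the hunks above; fetch() and its 404 handling are illustrative, not part of the patch:

    from yt_dlp import YoutubeDL
    from yt_dlp.networking import Request
    from yt_dlp.networking.exceptions import HTTPError

    def fetch(ydl: YoutubeDL, url: str):
        # Request replaces sanitized_Request; headers are set as mapping
        # entries rather than through add_header() calls.
        req = Request(url, headers={'Referer': url})
        try:
            resp = ydl.urlopen(req)
        except HTTPError as err:  # was urllib.error.HTTPError / compat_HTTPError
            if err.status == 404:  # .status replaces the urllib .code attribute
                # The body would still be readable via err.response.read(),
                # as the extractor hunks below do when parsing error JSON.
                return None
            raise
        return resp.url, resp.read()  # .url replaces .geturl()

In extractor code the same pattern appears inside except ExtractorError handlers, with e.cause taking the place of err.
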
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index d4045e58f9..e307502db1 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -10,6 +10,7 @@ from .fragment import FragmentFD from ..compat import functools +from ..networking import Request from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( Popen, @@ -25,7 +26,6 @@ encodeFilename, find_available_port, remove_end, - sanitized_Request, traverse_obj, ) @@ -357,13 +357,12 @@ def aria2c_rpc(self, rpc_port, rpc_secret, method, params=()): 'method': method, 'params': [f'token:{rpc_secret}', *params], }).encode('utf-8') - request = sanitized_Request( + request = Request( f'http://localhost:{rpc_port}/jsonrpc', data=d, headers={ 'Content-Type': 'application/json', 'Content-Length': f'{len(d)}', - 'Ytdl-request-proxy': '__noproxy__', - }) + }, proxies={'all': None}) with self.ydl.urlopen(request) as r: resp = json.load(r) assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server' diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py index 306f92192f..28cbba0169 100644 --- a/yt_dlp/downloader/f4m.py +++ b/yt_dlp/downloader/f4m.py @@ -3,11 +3,11 @@ import itertools import struct import time -import urllib.error import urllib.parse from .fragment import FragmentFD from ..compat import compat_etree_fromstring +from ..networking.exceptions import HTTPError from ..utils import fix_xml_ampersands, xpath_text @@ -312,7 +312,7 @@ def real_download(self, filename, info_dict): self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() + man_url = urlh.url # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 # and https://github.com/ytdl-org/youtube-dl/issues/7823) @@ -407,8 +407,8 @@ def real_download(self, filename, info_dict): if box_type == b'mdat': self._append_fragment(ctx, box_data) break - except urllib.error.HTTPError as err: - if live and (err.code == 404 or err.code == 410): + except HTTPError as err: + if live and (err.status == 404 or err.status == 410): # We didn't keep up with the live window. Continue # with the next available fragment. 
msg = 'Fragment %d unavailable' % frag_i diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 0698153269..b4b680dae1 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -1,24 +1,19 @@ import concurrent.futures import contextlib -import http.client import json import math import os import struct import time -import urllib.error from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..compat import compat_os_name -from ..utils import ( - DownloadError, - RetryManager, - encodeFilename, - sanitized_Request, - traverse_obj, -) +from ..networking import Request +from ..networking.exceptions import HTTPError, IncompleteRead +from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj +from ..utils.networking import HTTPHeaderDict class HttpQuietDownloader(HttpFD): @@ -75,7 +70,7 @@ def report_skip_fragment(self, frag_index, err=None): def _prepare_url(self, info_dict, url): headers = info_dict.get('http_headers') - return sanitized_Request(url, None, headers) if headers else url + return Request(url, None, headers) if headers else url def _prepare_and_start_frag_download(self, ctx, info_dict): self._prepare_frag_download(ctx) @@ -457,7 +452,7 @@ def download_fragment(fragment, ctx): frag_index = ctx['fragment_index'] = fragment['frag_index'] ctx['last_error'] = None - headers = info_dict.get('http_headers', {}).copy() + headers = HTTPHeaderDict(info_dict.get('http_headers')) byte_range = fragment.get('byte_range') if byte_range: headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) @@ -477,7 +472,7 @@ def error_callback(err, count, retries): if not self._download_fragment( ctx, fragment['url'], info_dict, headers, info_dict.get('request_data')): return - except (urllib.error.HTTPError, http.client.IncompleteRead) as err: + except (HTTPError, IncompleteRead) as err: retry.error = err continue except DownloadError: # has own retry settings diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index ab7d496d42..d4b3f03200 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -75,7 +75,7 @@ def real_download(self, filename, info_dict): self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() + man_url = urlh.url s = urlh.read().decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 45d094721a..f5237443e2 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -1,10 +1,14 @@ import os import random import time -import urllib.error from .common import FileDownloader -from ..networking.exceptions import CertificateVerifyError, TransportError +from ..networking import Request +from ..networking.exceptions import ( + CertificateVerifyError, + HTTPError, + TransportError, +) from ..utils import ( ContentTooShortError, RetryManager, @@ -14,10 +18,10 @@ encodeFilename, int_or_none, parse_http_range, - sanitized_Request, try_call, write_xattr, ) +from ..utils.networking import HTTPHeaderDict class HttpFD(FileDownloader): @@ -36,10 +40,7 @@ class DownloadContext(dict): ctx.stream = None # Disable compression - headers = {'Accept-Encoding': 'identity'} - add_headers = info_dict.get('http_headers') - if add_headers: - headers.update(add_headers) + headers = 
HTTPHeaderDict({'Accept-Encoding': 'identity'}, info_dict.get('http_headers')) is_test = self.params.get('test', False) chunk_size = self._TEST_FILE_SIZE if is_test else ( @@ -110,10 +111,10 @@ def establish_connection(): if try_call(lambda: range_end >= ctx.content_len): range_end = ctx.content_len - 1 - request = sanitized_Request(url, request_data, headers) + request = Request(url, request_data, headers) has_range = range_start is not None if has_range: - request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}') + request.headers['Range'] = f'bytes={int(range_start)}-{int_or_none(range_end) or ""}' # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -144,17 +145,17 @@ def establish_connection(): self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' - ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) - except urllib.error.HTTPError as err: - if err.code == 416: + ctx.data_len = ctx.content_len = int_or_none(ctx.data.headers.get('Content-length', None)) + except HTTPError as err: + if err.status == 416: # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header ctx.data = self.ydl.urlopen( - sanitized_Request(url, request_data, headers)) - content_length = ctx.data.info()['Content-Length'] - except urllib.error.HTTPError as err: - if err.code < 500 or err.code >= 600: + Request(url, request_data, headers)) + content_length = ctx.data.headers['Content-Length'] + except HTTPError as err: + if err.status < 500 or err.status >= 600: raise else: # Examine the reported length @@ -182,7 +183,7 @@ def establish_connection(): ctx.resume_len = 0 ctx.open_mode = 'wb' return - elif err.code < 500 or err.code >= 600: + elif err.status < 500 or err.status >= 600: # Unexpected HTTP error raise raise RetryDownload(err) @@ -198,9 +199,9 @@ def close_stream(): ctx.stream = None def download(): - data_len = ctx.data.info().get('Content-length') + data_len = ctx.data.headers.get('Content-length') - if ctx.data.info().get('Content-encoding'): + if ctx.data.headers.get('Content-encoding'): # Content-encoding is present, Content-length is not reliable anymore as we are # doing auto decompression. 
(See: https://github.com/yt-dlp/yt-dlp/pull/6176) data_len = None @@ -345,7 +346,7 @@ def retry(e): # Update file modification time if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.headers.get('last-modified', None)) self._hook_progress({ 'downloaded_bytes': byte_counter, diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index a157a8ad93..dd688f586d 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -2,9 +2,9 @@ import io import struct import time -import urllib.error from .fragment import FragmentFD +from ..networking.exceptions import HTTPError from ..utils import RetryManager u8 = struct.Struct('>B') @@ -271,7 +271,7 @@ def real_download(self, filename, info_dict): write_piff_header(ctx['dest_stream'], info_dict['_download_params']) extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) - except urllib.error.HTTPError as err: + except HTTPError as err: retry.error = err continue diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 7d8575c2a4..5720f6eb8f 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -5,13 +5,8 @@ from . import get_suitable_downloader from .common import FileDownloader from .external import FFmpegFD -from ..utils import ( - DownloadError, - WebSocketsWrapper, - sanitized_Request, - str_or_none, - try_get, -) +from ..networking import Request +from ..utils import DownloadError, WebSocketsWrapper, str_or_none, try_get class NiconicoDmcFD(FileDownloader): @@ -33,7 +28,7 @@ def real_download(self, filename, info_dict): heartbeat_data = heartbeat_info_dict['data'].encode() heartbeat_interval = heartbeat_info_dict.get('interval', 30) - request = sanitized_Request(heartbeat_url, heartbeat_data) + request = Request(heartbeat_url, heartbeat_data) def heartbeat(): try: diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index 5928fecf0b..c7a86374aa 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -1,8 +1,8 @@ import json import time -import urllib.error from .fragment import FragmentFD +from ..networking.exceptions import HTTPError from ..utils import ( RegexNotFoundError, RetryManager, @@ -10,6 +10,7 @@ int_or_none, try_get, ) +from ..utils.networking import HTTPHeaderDict class YoutubeLiveChatFD(FragmentFD): @@ -37,10 +38,7 @@ def real_download(self, filename, info_dict): start_time = int(time.time() * 1000) def dl_fragment(url, data=None, headers=None): - http_headers = info_dict.get('http_headers', {}) - if headers: - http_headers = http_headers.copy() - http_headers.update(headers) + http_headers = HTTPHeaderDict(info_dict.get('http_headers'), headers) return self._download_fragment(ctx, url, info_dict, http_headers, data) def parse_actions_replay(live_chat_continuation): @@ -129,7 +127,7 @@ def download_and_parse_fragment(url, frag_index, request_data=None, headers=None or frag_index == 1 and try_refresh_replay_beginning or parse_actions_replay) return (True, *func(live_chat_continuation)) - except urllib.error.HTTPError as err: + except HTTPError as err: retry.error = err continue return False, None, None, None diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index c9166b6b8c..98ece8da7d 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -22,7 +22,6 @@ 
int_or_none, intlist_to_bytes, OnDemandPagedList, - request_to_url, time_seconds, traverse_obj, update_url_query, @@ -137,7 +136,7 @@ def _get_videokey_from_ticket(self, ticket): return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) def abematv_license_open(self, url): - url = request_to_url(url) + url = url.get_full_url() if isinstance(url, urllib.request.Request) else url ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) return urllib.response.addinfourl(io.BytesIO(response_data), headers={ diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index f1f55e87fc..b59dbc8500 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -6,10 +6,8 @@ from .common import InfoExtractor from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import ( - compat_HTTPError, - compat_b64decode, -) +from ..compat import compat_b64decode +from ..networking.exceptions import HTTPError from ..utils import ( ass_subtitles_timecode, bytes_to_intlist, @@ -142,9 +140,9 @@ def _perform_login(self, username, password): self._HEADERS = {'authorization': 'Bearer ' + access_token} except ExtractorError as e: message = None - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: resp = self._parse_json( - e.cause.read().decode(), None, fatal=False) or {} + e.cause.response.read().decode(), None, fatal=False) or {} message = resp.get('message') or resp.get('code') self.report_warning(message or self._LOGIN_ERR_MESSAGE) @@ -195,14 +193,14 @@ def _real_extract(self, url): }) break except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise e - if e.cause.code == 401: + if e.cause.status == 401: # This usually goes away with a different random pkcs1pad, so retry continue - error = self._parse_json(e.cause.read(), video_id) + error = self._parse_json(e.cause.response.read(), video_id) message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 722a534ed6..5eed0ca226 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -2,11 +2,11 @@ import json import re import time -import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor from ..compat import compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( NO_DEFAULT, ExtractorError, @@ -1394,7 +1394,7 @@ def post_form(form_page_res, note, data={}): form_page, urlh = form_page_res post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') if not re.match(r'https?://', post_url): - post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + post_url = compat_urlparse.urljoin(urlh.url, post_url) form_data = self._hidden_inputs(form_page) form_data.update(data) return self._download_webpage_handle( @@ -1619,7 +1619,7 @@ def extract_redirect_url(html, url=None, fatal=False): hidden_data['history'] = 1 provider_login_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending first bookend', + urlh.url, video_id, 'Sending first bookend', query=hidden_data) provider_association_redirect, urlh = post_form( @@ -1629,7 +1629,7 @@ def extract_redirect_url(html, url=None, fatal=False): }) provider_refresh_redirect_url = 
extract_redirect_url( - provider_association_redirect, url=urlh.geturl()) + provider_association_redirect, url=urlh.url) last_bookend_page, urlh = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1638,7 +1638,7 @@ def extract_redirect_url(html, url=None, fatal=False): hidden_data['history'] = 3 mvpd_confirm_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending final bookend', + urlh.url, video_id, 'Sending final bookend', query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1652,7 +1652,7 @@ def extract_redirect_url(html, url=None, fatal=False): hidden_data['history_val'] = 1 provider_login_redirect_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending First Bookend', + urlh.url, video_id, 'Sending First Bookend', query=hidden_data) provider_login_redirect_page, urlh = provider_login_redirect_page_res @@ -1680,7 +1680,7 @@ def extract_redirect_url(html, url=None, fatal=False): }) provider_refresh_redirect_url = extract_redirect_url( - provider_association_redirect, url=urlh.geturl()) + provider_association_redirect, url=urlh.url) last_bookend_page, urlh = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1690,7 +1690,7 @@ def extract_redirect_url(html, url=None, fatal=False): hidden_data['history_val'] = 3 mvpd_confirm_page_res = self._download_webpage_handle( - urlh.geturl(), video_id, 'Sending Final Bookend', + urlh.url, video_id, 'Sending Final Bookend', query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1699,7 +1699,7 @@ def extract_redirect_url(html, url=None, fatal=False): # based redirect that should be followed. provider_redirect_page, urlh = provider_redirect_page_res provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) + provider_redirect_page, url=urlh.url) if provider_refresh_redirect_url: provider_redirect_page_res = self._download_webpage_handle( provider_refresh_redirect_url, video_id, @@ -1724,7 +1724,7 @@ def extract_redirect_url(html, url=None, fatal=False): 'requestor_id': requestor_id, }), headers=mvpd_headers) except ExtractorError as e: - if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401: raise_mvpd_required() raise if '<pendingLogout' in session: diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py index 7b384b22d0..217e3acc43 100644 --- a/yt_dlp/extractor/ant1newsgr.py +++ b/yt_dlp/extractor/ant1newsgr.py @@ -1,8 +1,8 @@ import urllib.parse from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, @@ -121,7 +121,7 @@ def _real_extract(self, url): canonical_url = self._request_webpage( HEADRequest(url), video_id, note='Resolve canonical player URL', - errnote='Could not resolve canonical player URL').geturl() + errnote='Could not resolve canonical player URL').url _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url) cid = urllib.parse.parse_qs(query)['cid'][0] diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 4ccd398257..2541cd6fd8 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -1,16 +1,16 @@ import json import re -import urllib.error import urllib.parse from .common import InfoExtractor from .naver import NaverBaseIE from .youtube import 
YoutubeBaseInfoExtractor, YoutubeIE -from ..compat import compat_HTTPError, compat_urllib_parse_unquote +from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, - HEADRequest, bug_reports_message, clean_html, dict_get, @@ -899,7 +899,7 @@ def _real_extract(self, url): video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: self.raise_no_formats( 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) else: @@ -926,7 +926,7 @@ def _real_extract(self, url): info['thumbnails'] = self._extract_thumbnails(video_id) if urlh: - url = compat_urllib_parse_unquote(urlh.geturl()) + url = compat_urllib_parse_unquote(urlh.url) video_file_url_qs = parse_qs(url) # Attempt to recover any ext & format info from playback url & response headers format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} @@ -1052,7 +1052,7 @@ def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs): try: return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: raise ExtractorError('Page was not archived', expected=True) retry.error = e continue diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index a20e7f9889..3a44e5265b 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -34,8 +34,8 @@ class AtresPlayerIE(InfoExtractor): _API_BASE = 'https://api.atresplayer.com/' def _handle_error(self, e, code): - if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: - error = self._parse_json(e.cause.read(), None) + if isinstance(e.cause, HTTPError) and e.cause.status == code: + error = self._parse_json(e.cause.response.read(), None) if error.get('error') == 'required_registered': self.raise_login_required() raise ExtractorError(error['error_description'], expected=True) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9d28e70a3a..a55cdef2b8 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -2,11 +2,11 @@ import itertools import json import re -import urllib.error import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str, compat_urlparse +from ..compat import compat_str, compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -277,7 +277,7 @@ def _perform_login(self, username, password): post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Referer': self._LOGIN_URL}) - if self._LOGIN_URL in urlh.geturl(): + if self._LOGIN_URL in urlh.url: error = clean_html(get_element_by_class('form-message', response)) if error: raise ExtractorError( @@ -388,8 +388,8 @@ def _process_media_selector(self, media_selection, 
programme_id): href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], urllib.error.HTTPError) - and e.exc_info[1].code in (403, 404)): + if not (isinstance(e.exc_info[1], HTTPError) + and e.exc_info[1].status in (403, 404)): raise fmts = [] formats.extend(fmts) @@ -472,7 +472,7 @@ def _download_playlist(self, playlist_id): return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404): raise # fallback to legacy playlist @@ -983,7 +983,7 @@ def _real_extract(self, url): # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + if isinstance(e.cause, HTTPError) and e.cause.status == 500: continue raise if entry: diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index e8714a33ab..cb7ab2a174 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -4,11 +4,11 @@ import itertools import math import time -import urllib.error import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from ..dependencies import Cryptodome +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, @@ -614,7 +614,7 @@ def fetch_page(page_idx): response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index a6779505e5..0805b8b46f 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -2,9 +2,9 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, OnDemandPagedList, clean_html, get_element_by_class, diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index 13cc1927f1..419fe8c9c8 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -1,6 +1,6 @@ from .adobepass import AdobePassIE +from ..networking import HEADRequest from ..utils import ( - HEADRequest, extract_attributes, float_or_none, get_element_html_by_class, @@ -155,7 +155,7 @@ def _real_extract(self, url): chapters = None m3u8_url = self._request_webpage(HEADRequest( - update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').geturl() + update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url if 'mpeg_cenc' in m3u8_url: self.report_drm(video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index cd0e8ff275..61b18412d4 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -7,10 +7,10 @@ from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_HTTPError, 
compat_parse_qs, compat_urlparse, ) +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, dict_get, @@ -915,8 +915,8 @@ def extract_policy_key(): json_data = self._download_json(api_url, video_id, headers=headers) break except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): + json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0] message = json_data.get('message') or json_data['error_code'] if json_data.get('error_subcode') == 'CLIENT_GEO': self.raise_geo_restricted(msg=message) diff --git a/yt_dlp/extractor/canalplus.py b/yt_dlp/extractor/canalplus.py index b7e2f9dd46..3ff5c3fbfc 100644 --- a/yt_dlp/extractor/canalplus.py +++ b/yt_dlp/extractor/canalplus.py @@ -64,7 +64,7 @@ def _real_extract(self, url): # response = self._request_webpage( # HEADRequest(fmt_url), video_id, # 'Checking if the video is georestricted') - # if '/blocage' in response.geturl(): + # if '/blocage' in response.url: # raise ExtractorError( # 'The video is not available in your country', # expected=True) diff --git a/yt_dlp/extractor/cbsnews.py b/yt_dlp/extractor/cbsnews.py index 65ecc62f02..5a8ebb8476 100644 --- a/yt_dlp/extractor/cbsnews.py +++ b/yt_dlp/extractor/cbsnews.py @@ -7,9 +7,9 @@ from .anvato import AnvatoIE from .common import InfoExtractor from .paramountplus import ParamountPlusIE +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, UserNotLive, determine_ext, float_or_none, diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index be2b0bb433..8390160a0d 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -1,20 +1,20 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) +from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse +from ..networking import Request from ..utils import ( ExtractorError, float_or_none, - sanitized_Request, str_or_none, traverse_obj, urlencode_postdata, - USER_AGENTS, ) +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' @@ -97,7 +97,7 @@ class CeskaTelevizeIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(url, playlist_id) - parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + parsed_url = compat_urllib_parse_urlparse(urlh.url) site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') playlist_title = self._og_search_title(webpage, default=None) if site_name and playlist_title: @@ -163,16 +163,16 @@ def _real_extract(self, url): entries = [] for user_agent in (None, USER_AGENTS['Safari']): - req = sanitized_Request( + req = Request( 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') + req.headers['Content-type'] = 'application/x-www-form-urlencoded' + 
req.headers['x-addr'] = '127.0.0.1' + req.headers['X-Requested-With'] = 'XMLHttpRequest' if user_agent: - req.add_header('User-Agent', user_agent) - req.add_header('Referer', url) + req.headers['User-Agent'] = user_agent + req.headers['Referer'] = url playlistpage = self._download_json(req, playlist_id, fatal=False) @@ -183,8 +183,8 @@ def _real_extract(self, url): if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) + req = Request(compat_urllib_parse_unquote(playlist_url)) + req.headers['Referer'] = url playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: diff --git a/yt_dlp/extractor/cinetecamilano.py b/yt_dlp/extractor/cinetecamilano.py index 5e770ebac2..9cffa11e81 100644 --- a/yt_dlp/extractor/cinetecamilano.py +++ b/yt_dlp/extractor/cinetecamilano.py @@ -1,6 +1,6 @@ import json -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -40,7 +40,7 @@ def _real_extract(self, url): 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '' }) except ExtractorError as e: - if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500) + if ((isinstance(e.cause, HTTPError) and e.cause.status == 500) or isinstance(e.cause, json.JSONDecodeError)): self.raise_login_required(method='cookies') raise diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py index 40430505d6..85585dffbb 100644 --- a/yt_dlp/extractor/ciscowebex.py +++ b/yt_dlp/extractor/ciscowebex.py @@ -33,7 +33,7 @@ def _real_extract(self, url): if rcid: webpage = self._download_webpage(url, None, note='Getting video ID') url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') - url = self._request_webpage(url, None, note='Resolving final URL').geturl() + url = self._request_webpage(url, None, note='Resolving final URL').url mobj = self._match_valid_url(url) subdomain = mobj.group('subdomain') siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') @@ -49,7 +49,7 @@ def _real_extract(self, url): 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) - if urlh.getcode() == 403: + if urlh.status == 403: if stream['code'] == 53004: self.raise_login_required() if stream['code'] == 53005: @@ -59,7 +59,7 @@ def _real_extract(self, url): 'This video is protected by a password, use the --video-password option', expected=True) raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True) - if urlh.getcode() == 429: + if urlh.status == 429: self.raise_login_required( f'{self.IE_NAME} asks you to solve a CAPTCHA. 
Solve CAPTCHA in browser and', method='cookies') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 63156d3ac9..d449187764 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -31,8 +31,12 @@ from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media from ..downloader.hls import HlsFD -from ..networking.common import HEADRequest, Request -from ..networking.exceptions import network_exceptions +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) from ..utils import ( IDENTITY, JSON_LD_RE, @@ -729,7 +733,7 @@ def extract(self, url): e.ie = e.ie or self.IE_NAME, e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -788,16 +792,19 @@ def IE_NAME(cls): @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) def _create_request(self, url_or_request, data=None, headers=None, query=None): if isinstance(url_or_request, urllib.request.Request): + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. 
' + 'Use yt_dlp.networking.common.Request instead.') url_or_request = urllib_req_to_req(url_or_request) elif not isinstance(url_or_request, Request): url_or_request = Request(url_or_request) @@ -839,7 +846,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa try: return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): return err.response @@ -973,11 +980,11 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) + self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.geturl(), video_id) + filename = self._request_dump_filename(urlh.url, video_id) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1109,7 +1116,7 @@ def _download_webpage( while True: try: return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) - except http.client.IncompleteRead as e: + except IncompleteRead as e: try_count += 1 if try_count >= tries: raise e @@ -1806,7 +1813,7 @@ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality= return [] manifest, urlh = res - manifest_url = urlh.geturl() + manifest_url = urlh.url return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, @@ -1965,7 +1972,7 @@ def _extract_m3u8_formats_and_subtitles( return [], {} m3u8_doc, urlh = res - m3u8_url = urlh.geturl() + m3u8_url = urlh.url return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, @@ -2243,7 +2250,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 return [], {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url namespace = self._parse_smil_namespace(smil) @@ -2266,7 +2273,7 @@ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): return {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) @@ -2458,7 +2465,7 @@ def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): return [] xspf, urlh = res - xspf_url = urlh.geturl() + xspf_url = urlh.url return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, @@ -2529,7 +2536,7 @@ def _extract_mpd_formats_and_subtitles( return [], {} # We could have been redirected to a new url when we retrieved our mpd file. 
- mpd_url = urlh.geturl() + mpd_url = urlh.url mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( @@ -2900,7 +2907,7 @@ def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, not if ism_doc is None: return [], {} - return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id) def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py index 46100151a9..1ef90b5a07 100644 --- a/yt_dlp/extractor/crackle.py +++ b/yt_dlp/extractor/crackle.py @@ -4,7 +4,7 @@ import time from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, float_or_none, @@ -113,7 +113,7 @@ def _real_extract(self, url): errnote='Unable to download media JSON') except ExtractorError as e: # 401 means geo restriction, trying next country - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: continue raise diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 910504ed29..adb3d5dcf6 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,7 +1,7 @@ import base64 -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -114,7 +114,7 @@ def _call_api(self, path, internal_id, lang, note='api', query={}): result = self._call_base_api( path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) except ExtractorError as error: - if isinstance(error.cause, urllib.error.HTTPError) and error.cause.code == 404: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: return None raise diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py index 2fb22800f3..9c8509f1f3 100644 --- a/yt_dlp/extractor/cultureunplugged.py +++ b/yt_dlp/extractor/cultureunplugged.py @@ -1,10 +1,8 @@ import time from .common import InfoExtractor -from ..utils import ( - int_or_none, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import int_or_none class CultureUnpluggedIE(InfoExtractor): diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py index cf683bad48..4e81aa4a7b 100644 --- a/yt_dlp/extractor/dacast.py +++ b/yt_dlp/extractor/dacast.py @@ -1,9 +1,9 @@ import hashlib import re import time -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, classproperty, @@ -105,7 +105,7 @@ def _real_extract(self, url): formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls') except ExtractorError as e: # CDN will randomly respond with 403 - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: retry.error = e continue raise diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 2a44718fb5..21263d41b0 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -68,9 
+68,9 @@ def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): None, 'Downloading Access Token', data=urlencode_postdata(data))['access_token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError(self._parse_json( - e.cause.read().decode(), xid)['error_description'], expected=True) + e.cause.response.read().decode(), xid)['error_description'], expected=True) raise self._set_dailymotion_cookie('access_token' if username else 'client_token', token) self._HEADERS['Authorization'] = 'Bearer ' + token diff --git a/yt_dlp/extractor/discovery.py b/yt_dlp/extractor/discovery.py index e6e109d5c5..75b464353b 100644 --- a/yt_dlp/extractor/discovery.py +++ b/yt_dlp/extractor/discovery.py @@ -3,8 +3,8 @@ from .discoverygo import DiscoveryGoBaseIE from ..compat import compat_urllib_parse_unquote +from ..networking.exceptions import HTTPError from ..utils import ExtractorError -from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): @@ -100,9 +100,9 @@ def _real_extract(self, url): self._API_BASE_URL + 'streaming/video/' + video_id, display_id, 'Downloading streaming JSON metadata', headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): e_description = self._parse_json( - e.cause.read().decode(), display_id)['description'] + e.cause.response.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) if 'Authorized Networks' in e_description: diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index cf6d149342..6404752f7e 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -2,7 +2,7 @@ import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -39,7 +39,7 @@ def _get_auth(self, disco_base, display_id, realm, needs_device_id=True): return f'Bearer {token}' def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) + info = self._parse_json(e.cause.response.read().decode('utf-8'), None) error = info['errors'][0] error_code = error.get('code') if error_code == 'access.denied.geoblocked': @@ -87,7 +87,7 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domai 'include': 'images,primaryChannel,show,tags' }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self._process_errors(e, geo_countries) raise video_id = video['data']['id'] @@ -99,7 +99,7 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domai streaming = self._download_video_playback_info( disco_base, video_id, headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: self._process_errors(e, geo_countries) raise for format_dict in streaming: diff --git a/yt_dlp/extractor/eagleplatform.py b/yt_dlp/extractor/eagleplatform.py index 9ebd24d808..739d17912a 100644 --- a/yt_dlp/extractor/eagleplatform.py +++ b/yt_dlp/extractor/eagleplatform.py @@ -2,7 +2,7 @@ import re from 
.common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -111,8 +111,8 @@ def _download_json(self, url_or_request, video_id, *args, **kwargs): response = super(EaglePlatformIE, self)._download_json( url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + if isinstance(ee.cause, HTTPError): + response = self._parse_json(ee.cause.response.read().decode('utf-8'), video_id) self._handle_error(response) raise return response diff --git a/yt_dlp/extractor/eitb.py b/yt_dlp/extractor/eitb.py index bd027da6b4..66afbb6bb2 100644 --- a/yt_dlp/extractor/eitb.py +++ b/yt_dlp/extractor/eitb.py @@ -1,10 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - sanitized_Request, -) +from ..networking import Request +from ..utils import float_or_none, int_or_none, parse_iso8601 class EitbIE(InfoExtractor): @@ -54,7 +50,7 @@ def _real_extract(self, url): hls_url = media.get('HLS_SURL') if hls_url: - request = sanitized_Request( + request = Request( 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', headers={'Referer': url}) token_data = self._download_json( diff --git a/yt_dlp/extractor/eporner.py b/yt_dlp/extractor/eporner.py index a2337979b8..aee2dee581 100644 --- a/yt_dlp/extractor/eporner.py +++ b/yt_dlp/extractor/eporner.py @@ -52,7 +52,7 @@ def _real_extract(self, url): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(urlh.geturl()) + video_id = self._match_id(urlh.url) hash = self._search_regex( r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 9d871eb286..9f4d3fb789 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -8,6 +8,8 @@ compat_str, compat_urllib_parse_unquote, ) +from ..networking import Request +from ..networking.exceptions import network_exceptions from ..utils import ( ExtractorError, clean_html, @@ -19,11 +21,9 @@ int_or_none, js_to_json, merge_dicts, - network_exceptions, parse_count, parse_qs, qualities, - sanitized_Request, traverse_obj, try_get, url_or_none, @@ -319,7 +319,7 @@ class FacebookIE(InfoExtractor): } def _perform_login(self, username, password): - login_page_req = sanitized_Request(self._LOGIN_URL) + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', @@ -340,8 +340,8 @@ def _perform_login(self, username, password): 'timezone': '-60', 'trynum': '1', } - request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' try: login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') @@ -367,8 +367,8 @@ def _perform_login(self, username, password): 'h': h, 'name_action_selected': 'dont_save', } - check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) - check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_req = 
Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) + check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded' check_response = self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index dd5e088fc1..ba19b6cab4 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -3,11 +3,11 @@ from .common import InfoExtractor from ..compat import compat_parse_qs from ..dependencies import websockets +from ..networking import Request from ..utils import ( ExtractorError, WebSocketsWrapper, js_to_json, - sanitized_Request, traverse_obj, update_url_query, urlencode_postdata, @@ -57,7 +57,7 @@ def _login(self): } login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( + request = Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') @@ -66,7 +66,7 @@ def _login(self): return False # this is also needed - login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') + login_redir = Request('http://id.fc2.com/?mode=redirect&login=done') self._download_webpage( login_redir, None, note='Login redirect', errnote='Login redirect failed') diff --git a/yt_dlp/extractor/filmon.py b/yt_dlp/extractor/filmon.py index 9a93cb9840..0cd18f4947 100644 --- a/yt_dlp/extractor/filmon.py +++ b/yt_dlp/extractor/filmon.py @@ -1,8 +1,6 @@ from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( qualities, strip_or_none, @@ -40,8 +38,8 @@ def _real_extract(self, url): 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, video_id)['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['reason'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise @@ -124,8 +122,8 @@ def _real_extract(self, url): channel_data = self._download_json( 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), channel_id)['message'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py index 15c0c48c17..8fb4ada6be 100644 --- a/yt_dlp/extractor/fox.py +++ b/yt_dlp/extractor/fox.py @@ -3,10 +3,10 @@ from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urllib_parse_unquote, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -68,9 +68,9 @@ def _call_api(self, path, video_id, data=None): 'https://api3.fox.com/v2.0/' + path, video_id, data=data, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: entitlement_issues = self._parse_json( - e.cause.read().decode(), 
diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py
index 15c0c48c17..8fb4ada6be 100644
--- a/yt_dlp/extractor/fox.py
+++ b/yt_dlp/extractor/fox.py
@@ -3,10 +3,10 @@
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_HTTPError,
     compat_str,
     compat_urllib_parse_unquote,
 )
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -68,9 +68,9 @@ def _call_api(self, path, video_id, data=None):
                 'https://api3.fox.com/v2.0/' + path,
                 video_id, data=data, headers=headers)
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 entitlement_issues = self._parse_json(
-                    e.cause.read().decode(), video_id)['entitlementIssues']
+                    e.cause.response.read().decode(), video_id)['entitlementIssues']
                 for e in entitlement_issues:
                     if e.get('errorCode') == 1005:
                         raise ExtractorError(
@@ -123,8 +123,8 @@ def _real_extract(self, url):
         try:
             m3u8_url = self._download_json(release_url, video_id)['playURL']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                error = self._parse_json(e.cause.read().decode(), video_id)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                error = self._parse_json(e.cause.response.read().decode(), video_id)
                 if error.get('exception') == 'GeoLocationBlocked':
                     self.raise_geo_restricted(countries=['US'])
                 raise ExtractorError(error['description'], expected=True)
diff --git a/yt_dlp/extractor/foxsports.py b/yt_dlp/extractor/foxsports.py
index f906a1718d..8e89ccf841 100644
--- a/yt_dlp/extractor/foxsports.py
+++ b/yt_dlp/extractor/foxsports.py
@@ -1,6 +1,7 @@
 from .common import InfoExtractor
 from .uplynk import UplynkPreplayIE
-from ..utils import HEADRequest, float_or_none, make_archive_id, smuggle_url
+from ..networking import HEADRequest
+from ..utils import float_or_none, make_archive_id, smuggle_url
 
 
 class FoxSportsIE(InfoExtractor):
@@ -35,7 +36,7 @@ def _real_extract(self, url):
             'x-api-key': 'cf289e299efdfa39fb6316f259d1de93',
         })
         preplay_url = self._request_webpage(
-            HEADRequest(data['url']), video_id, 'Fetching preplay URL').geturl()
+            HEADRequest(data['url']), video_id, 'Fetching preplay URL').url
 
         return {
             '_type': 'url_transparent',
diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py
index 668bb2743c..77e826e2db 100644
--- a/yt_dlp/extractor/fujitv.py
+++ b/yt_dlp/extractor/fujitv.py
@@ -1,5 +1,5 @@
-from ..utils import HEADRequest
 from .common import InfoExtractor
+from ..networking import HEADRequest
 
 
 class FujiTVFODPlus7IE(InfoExtractor):
diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py
index 47c316664a..41de85cc64 100644
--- a/yt_dlp/extractor/funimation.py
+++ b/yt_dlp/extractor/funimation.py
@@ -3,7 +3,7 @@
 import string
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -46,8 +46,8 @@ def _perform_login(self, username, password):
                 }))
             FunimationBaseIE._TOKEN = data['token']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read().decode(), None)['error']
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read().decode(), None)['error']
                 raise ExtractorError(error, expected=True)
             raise
diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py
index 2878bbd88c..4265feb61f 100644
--- a/yt_dlp/extractor/gdcvault.py
+++ b/yt_dlp/extractor/gdcvault.py
@@ -2,13 +2,8 @@
 
 from .common import InfoExtractor
 from .kaltura import KalturaIE
-from ..utils import (
-    HEADRequest,
-    remove_start,
-    sanitized_Request,
-    smuggle_url,
-    urlencode_postdata,
-)
+from ..networking import HEADRequest, Request
+from ..utils import remove_start, smuggle_url, urlencode_postdata
 
 
 class GDCVaultIE(InfoExtractor):
@@ -138,8 +133,8 @@ def _login(self, webpage_url, display_id):
             'password': password,
         }
 
-        request = sanitized_Request(login_url, urlencode_postdata(login_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        request = Request(login_url, urlencode_postdata(login_form))
+        request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
         self._download_webpage(request, display_id, 'Logging in')
         start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
         self._download_webpage(logout_url, display_id, 'Logging out')
 
@@ -163,7 +158,7 @@ def _real_extract(self, url):
             video_url = 'http://www.gdcvault.com' + direct_url
             # resolve the url so that we can detect the correct extension
             video_url = self._request_webpage(
-                HEADRequest(video_url), video_id).geturl()
+                HEADRequest(video_url), video_id).url
 
             return {
                 'id': video_id,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 87cf11d6bd..8fa4c62217 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2431,7 +2431,7 @@ def _real_extract(self, url):
                 'Accept-Encoding': 'identity',
                 **smuggled_data.get('http_headers', {})
             })
-        new_url = full_response.geturl()
+        new_url = full_response.url
         url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
         if new_url != extract_basic_auth(url)[0]:
             self.report_following_redirect(new_url)
@@ -2529,12 +2529,12 @@ def _real_extract(self, url):
             return self.playlist_result(
                 self._parse_xspf(
                     doc, video_id, xspf_url=url,
-                    xspf_base_url=full_response.geturl()),
+                    xspf_base_url=full_response.url),
                 video_id)
         elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
             info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
                 doc,
-                mpd_base_url=full_response.geturl().rpartition('/')[0],
+                mpd_base_url=full_response.url.rpartition('/')[0],
                 mpd_url=url)
             self._extra_manifest_info(info_dict, url)
             self.report_detected('DASH manifest')
@@ -2572,7 +2572,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
         info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
         video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
         url, smuggled_data = unsmuggle_url(url, {})
-        actual_url = urlh.geturl() if urlh else url
+        actual_url = urlh.url if urlh else url
 
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py
index a7be2cb766..df98f093c6 100644
--- a/yt_dlp/extractor/globo.py
+++ b/yt_dlp/extractor/globo.py
@@ -8,8 +8,8 @@
 from ..compat import (
     compat_str,
 )
+from ..networking import HEADRequest
 from ..utils import (
-    HEADRequest,
     ExtractorError,
     float_or_none,
     orderedSet,
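
gdcvault.py, generic.py and the hunks that follow replace the urllib-style urlh.geturl() with the new response handle's url property. The common use case, resolving a redirect via a HEAD request, now reads as follows (a sketch; the helper name is made up):

from ..networking import HEADRequest

def _resolve_final_url(self, media_url, video_id):
    # A HEAD request follows redirects without downloading the body;
    # .url on the handle replaces the old .geturl().
    urlh = self._request_webpage(HEADRequest(media_url), video_id, note='Resolving final URL')
    return urlh.url
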
diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py
index 8a4cd1690e..2fdec20f66 100644
--- a/yt_dlp/extractor/googledrive.py
+++ b/yt_dlp/extractor/googledrive.py
@@ -228,7 +228,7 @@ def add_source_format(urlh):
                 # Using original URLs may result in redirect loop due to
                 # google.com's cookies mistakenly used for googleusercontent.com
                 # redirect URLs (see #23919).
-                'url': urlh.geturl(),
+                'url': urlh.url,
                 'ext': determine_ext(title, 'mp4').lower(),
                 'format_id': 'source',
                 'quality': 1,
diff --git a/yt_dlp/extractor/hketv.py b/yt_dlp/extractor/hketv.py
index 10879564fa..e026996da6 100644
--- a/yt_dlp/extractor/hketv.py
+++ b/yt_dlp/extractor/hketv.py
@@ -126,7 +126,7 @@ def _real_extract(self, url):
             # If we ever wanted to provide the final resolved URL that
             # does not require cookies, albeit with a shorter lifespan:
             #     urlh = self._downloader.urlopen(file_url)
-            #     resolved_url = urlh.geturl()
+            #     resolved_url = urlh.url
             label = fmt.get('label')
             h = self._FORMAT_HEIGHTS.get(label)
             w = h * width // height if h and width and height else None
diff --git a/yt_dlp/extractor/hotnewhiphop.py b/yt_dlp/extractor/hotnewhiphop.py
index f8570cb861..3007fbb530 100644
--- a/yt_dlp/extractor/hotnewhiphop.py
+++ b/yt_dlp/extractor/hotnewhiphop.py
@@ -1,11 +1,7 @@
 from .common import InfoExtractor
 from ..compat import compat_b64decode
-from ..utils import (
-    ExtractorError,
-    HEADRequest,
-    sanitized_Request,
-    urlencode_postdata,
-)
+from ..networking import HEADRequest, Request
+from ..utils import ExtractorError, urlencode_postdata
 
 
 class HotNewHipHopIE(InfoExtractor):
@@ -36,9 +32,9 @@ def _real_extract(self, url):
             ('mediaType', 's'),
             ('mediaId', video_id),
         ])
-        r = sanitized_Request(
+        r = Request(
             'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata)
-        r.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        r.headers['Content-Type'] = 'application/x-www-form-urlencoded'
         mkd = self._download_json(
             r, video_id, note='Requesting media key',
             errnote='Could not download media key')
@@ -50,7 +46,7 @@ def _real_extract(self, url):
         req = self._request_webpage(
             redirect_req, video_id,
             note='Resolving final URL', errnote='Could not resolve final URL')
-        video_url = req.geturl()
+        video_url = req.url
         if video_url.endswith('.html'):
             raise ExtractorError('Redirect failed')
 
diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py
index 591e23b8ad..324e9f51db 100644
--- a/yt_dlp/extractor/hotstar.py
+++ b/yt_dlp/extractor/hotstar.py
@@ -6,7 +6,8 @@
 import uuid
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -233,7 +234,7 @@ def _real_extract(self, url):
                     'height': int_or_none(playback_set.get('height')),
                 }]
             except ExtractorError as e:
-                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                     geo_restricted = True
                 continue
 
diff --git a/yt_dlp/extractor/hrti.py b/yt_dlp/extractor/hrti.py
index cfec80d144..57b76e46b4 100644
--- a/yt_dlp/extractor/hrti.py
+++ b/yt_dlp/extractor/hrti.py
@@ -1,13 +1,13 @@
 import json
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking import Request
+from ..networking.exceptions import HTTPError
 from ..utils import (
     clean_html,
     ExtractorError,
     int_or_none,
     parse_age_limit,
-    sanitized_Request,
     try_get,
 )
 
@@ -42,7 +42,7 @@ def _initialize_pre_login(self):
             'application_version': self._APP_VERSION
         }
 
-        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+        req = Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
         req.get_method = lambda: 'PUT'
 
         resources = self._download_json(
@@ -73,8 +73,8 @@ def _perform_login(self, username, password):
                 self._login_url, None, note='Logging in', errnote='Unable to log in',
                 data=json.dumps(auth_data).encode('utf-8'))
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
-                auth_info = self._parse_json(e.cause.read().encode('utf-8'), None)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 406:
+                auth_info = self._parse_json(e.cause.response.read().encode('utf-8'), None)
             else:
                 raise
 
diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py
index e4db7f9fa9..64875f8ceb 100644
--- a/yt_dlp/extractor/ign.py
+++ b/yt_dlp/extractor/ign.py
@@ -1,8 +1,9 @@
 import re
-import urllib.error
+import urllib.parse
 
 from .common import InfoExtractor
 from ..compat import compat_parse_qs
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -27,9 +28,9 @@ def _checked_call_api(self, slug):
         try:
             return self._call_api(slug)
         except ExtractorError as e:
-            if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                 e.cause.args = e.cause.args or [
-                    e.cause.geturl(), e.cause.getcode(), e.cause.reason]
+                    e.cause.response.url, e.cause.status, e.cause.reason]
                 raise ExtractorError(
                     'Content not found: expired?', cause=e.cause,
                     expected=True)
@@ -226,7 +227,7 @@ def _real_extract(self, url):
             parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
 
         webpage, urlh = self._download_webpage_handle(embed_url, video_id)
-        new_url = urlh.geturl()
+        new_url = urlh.url
         ign_url = compat_parse_qs(
             urllib.parse.urlparse(new_url).query).get('url', [None])[-1]
         if ign_url:
@@ -323,14 +324,14 @@ def _checked_call_api(self, slug):
         try:
             return self._call_api(slug)
         except ExtractorError as e:
-            if isinstance(e.cause, urllib.error.HTTPError):
+            if isinstance(e.cause, HTTPError):
                 e.cause.args = e.cause.args or [
-                    e.cause.geturl(), e.cause.getcode(), e.cause.reason]
-                if e.cause.code == 404:
+                    e.cause.response.url, e.cause.status, e.cause.reason]
+                if e.cause.status == 404:
                     raise ExtractorError(
                         'Content not found: expired?', cause=e.cause,
                         expected=True)
-                elif e.cause.code == 503:
+                elif e.cause.status == 503:
                     self.report_warning(error_to_compat_str(e.cause))
                     return
             raise
diff --git a/yt_dlp/extractor/imggaming.py b/yt_dlp/extractor/imggaming.py
index 8e220fd9f3..a40aa21763 100644
--- a/yt_dlp/extractor/imggaming.py
+++ b/yt_dlp/extractor/imggaming.py
@@ -1,7 +1,7 @@
 import json
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -52,9 +52,9 @@ def _extract_dve_api_url(self, media_id, media_type):
             return self._call_api(
                 stream_path, media_id)['playerUrlCallback']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 raise ExtractorError(
-                    self._parse_json(e.cause.read().decode(), media_id)['messages'][0],
+                    self._parse_json(e.cause.response.read().decode(), media_id)['messages'][0],
                     expected=True)
             raise
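
The ign.py hunk above touches every attribute of the old exception at once, which makes the mapping between the two error types explicit:

# Old urllib.error.HTTPError attribute  ->  new yt_dlp.networking.exceptions.HTTPError
#   e.cause.code / e.cause.getcode()    ->  e.cause.status
#   e.cause.geturl()                    ->  e.cause.response.url
#   e.cause.read()                      ->  e.cause.response.read()
#   e.cause.headers                     ->  e.cause.response.headers
#   e.cause.reason                      ->  e.cause.reason (unchanged)
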
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index 02335138f1..bfc4b7b888 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -3,9 +3,9 @@
 import json
 import re
 import time
-import urllib.error
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     decode_base_n,
@@ -442,7 +442,7 @@ def _real_extract(self, url):
         shared_data = self._search_json(
             r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {}
 
-        if shared_data and self._LOGIN_URL not in urlh.geturl():
+        if shared_data and self._LOGIN_URL not in urlh.url:
             media.update(traverse_obj(
                 shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
                 ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {})
@@ -589,7 +589,7 @@ def _extract_graphql(self, data, url):
             except ExtractorError as e:
                 # if it's an error caused by a bad query, and there are
                 # more GIS templates to try, ignore it and keep trying
-                if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                     if gis_tmpl != gis_tmpls[-1]:
                         continue
                 raise
diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py
index e58e9c2ee1..6dec1510da 100644
--- a/yt_dlp/extractor/iprima.py
+++ b/yt_dlp/extractor/iprima.py
@@ -81,7 +81,7 @@ def _perform_login(self, username, password):
             note='Logging in')
 
         # a profile may need to be selected first, even when there is only a single one
-        if '/profile-select' in login_handle.geturl():
+        if '/profile-select' in login_handle.url:
             profile_id = self._search_regex(
                 r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id')
 
@@ -89,7 +89,7 @@ def _perform_login(self, username, password):
                 f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None,
                 query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile')
 
-        code = traverse_obj(login_handle.geturl(), ({parse_qs}, 'code', 0))
+        code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0))
         if not code:
             raise ExtractorError('Login failed', expected=True)
 
diff --git a/yt_dlp/extractor/kakao.py b/yt_dlp/extractor/kakao.py
index 1f0f0a5d5c..43055e89de 100644
--- a/yt_dlp/extractor/kakao.py
+++ b/yt_dlp/extractor/kakao.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -101,8 +101,8 @@ def _real_extract(self, url):
                 cdn_api_base, video_id, query=query,
                 note='Downloading video URL for profile %s' % profile_name)
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                resp = self._parse_json(e.cause.read().decode(), video_id)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                resp = self._parse_json(e.cause.response.read().decode(), video_id)
                 if resp.get('code') == 'GeoBlocked':
                     self.raise_geo_restricted()
             raise
diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py
index be1dfd4b16..d124372424 100644
--- a/yt_dlp/extractor/kick.py
+++ b/yt_dlp/extractor/kick.py
@@ -1,7 +1,6 @@
 from .common import InfoExtractor
-
+from ..networking import HEADRequest
 from ..utils import (
-    HEADRequest,
     UserNotLive,
     float_or_none,
     merge_dicts,
diff --git a/yt_dlp/extractor/kuwo.py b/yt_dlp/extractor/kuwo.py
index cfec1c50f6..e8a061a104 100644
--- a/yt_dlp/extractor/kuwo.py
+++ b/yt_dlp/extractor/kuwo.py
@@ -91,7 +91,7 @@ def _real_extract(self, url):
         webpage, urlh = self._download_webpage_handle(
             url, song_id, note='Download song detail info',
             errnote='Unable to get song detail info')
-        if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
+        if song_id not in urlh.url or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
            raise ExtractorError('this song has been offline because of copyright issues', expected=True)
 
         song_name = self._html_search_regex(
diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py
index 36bfaf5c30..a3cd12b003 100644
--- a/yt_dlp/extractor/la7.py
+++ b/yt_dlp/extractor/la7.py
@@ -1,13 +1,8 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    float_or_none,
-    HEADRequest,
-    int_or_none,
-    parse_duration,
-    unified_strdate,
-)
+from ..networking import HEADRequest
+from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate
 
 
 class LA7IE(InfoExtractor):
diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py
index 23d3daf13e..6af64f0df4 100644
--- a/yt_dlp/extractor/lbry.py
+++ b/yt_dlp/extractor/lbry.py
@@ -3,9 +3,9 @@
 import urllib.parse
 
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
     ExtractorError,
-    HEADRequest,
     OnDemandPagedList,
     UnsupportedError,
     determine_ext,
@@ -266,7 +266,7 @@ def _real_extract(self, url):
             # HEAD request returns redirect response to m3u8 URL if available
             final_url = self._request_webpage(
                 HEADRequest(streaming_url), display_id, headers=headers,
-                note='Downloading streaming redirect url info').geturl()
+                note='Downloading streaming redirect url info').url
 
         elif result.get('value_type') == 'stream':
             claim_id, is_live = result['signing_channel']['claim_id'], True
diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py
index 973764c63f..bb059d3a29 100644
--- a/yt_dlp/extractor/lecturio.py
+++ b/yt_dlp/extractor/lecturio.py
@@ -25,7 +25,7 @@ def _perform_login(self, username, password):
             self._LOGIN_URL, None, 'Downloading login popup')
 
         def is_logged(url_handle):
-            return self._LOGIN_URL not in url_handle.geturl()
+            return self._LOGIN_URL not in url_handle.url
 
         # Already logged in
         if is_logged(urlh):
diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py
index 811b447587..46fc7a9b60 100644
--- a/yt_dlp/extractor/lego.py
+++ b/yt_dlp/extractor/lego.py
@@ -1,7 +1,7 @@
 import uuid
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -75,7 +75,7 @@ def _real_extract(self, url):
                 'videoId': '%s_%s' % (uuid.UUID(video_id), locale),
             }, headers=self.geo_verification_headers())
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 451:
                 self.raise_geo_restricted(countries=countries)
             raise
diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py
index e11ec43d66..4e50f106f9 100644
--- a/yt_dlp/extractor/limelight.py
+++ b/yt_dlp/extractor/limelight.py
@@ -1,7 +1,7 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     determine_ext,
     float_or_none,
@@ -69,8 +69,8 @@ def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
                 item_id, 'Downloading PlaylistService %s JSON' % method,
                 fatal=fatal, headers=headers)
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission']
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                error = self._parse_json(e.cause.response.read().decode(), item_id)['detail']['contentAccessPermission']
                 if error == 'CountryDisabled':
                     self.raise_geo_restricted()
                 raise ExtractorError(error, expected=True)
diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py
index 7bb64e17c4..0b16442932 100644
--- a/yt_dlp/extractor/linuxacademy.py
+++ b/yt_dlp/extractor/linuxacademy.py
@@ -2,11 +2,8 @@
 import random
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_b64decode,
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_b64decode, compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     clean_html,
     ExtractorError,
@@ -107,7 +104,7 @@ def random_string():
                 'sso': 'true',
             })
 
-        login_state_url = urlh.geturl()
+        login_state_url = urlh.url
 
         try:
             login_page = self._download_webpage(
@@ -119,8 +116,8 @@ def random_string():
                     'Referer': login_state_url,
                 })
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read(), None)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read(), None)
                 message = error.get('description') or error['code']
                 raise ExtractorError(
                     '%s said: %s' % (self.IE_NAME, message), expected=True)
@@ -137,7 +134,7 @@ def random_string():
             })
 
         access_token = self._search_regex(
-            r'access_token=([^=&]+)', urlh.geturl(),
+            r'access_token=([^=&]+)', urlh.url,
             'access token', default=None)
         if not access_token:
             access_token = self._parse_json(
diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py
index fe549c49fb..7ea78ab691 100644
--- a/yt_dlp/extractor/mediasite.py
+++ b/yt_dlp/extractor/mediasite.py
@@ -171,7 +171,7 @@ def _real_extract(self, url):
         query = mobj.group('query')
 
         webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
-        redirect_url = urlh.geturl()
+        redirect_url = urlh.url
 
         # XXX: might have also extracted UrlReferrer and QueryString from the html
         service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
diff --git a/yt_dlp/extractor/megatvcom.py b/yt_dlp/extractor/megatvcom.py
index 2f3f11f519..93c7e7dc08 100644
--- a/yt_dlp/extractor/megatvcom.py
+++ b/yt_dlp/extractor/megatvcom.py
@@ -1,14 +1,14 @@
 import re
 
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
+    ExtractorError,
     clean_html,
     determine_ext,
-    ExtractorError,
     extract_attributes,
     get_element_by_class,
     get_element_html_by_id,
-    HEADRequest,
     parse_qs,
     unescapeHTML,
     unified_timestamp,
@@ -160,5 +160,5 @@ def _real_extract(self, url):
         canonical_url = self._request_webpage(
             HEADRequest(canonical_url), video_id,
             note='Resolve canonical URL',
-            errnote='Could not resolve canonical URL').geturl()
+            errnote='Could not resolve canonical URL').url
         return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py
index 06edcb396a..31ccf004ec 100644
--- a/yt_dlp/extractor/mgtv.py
+++ b/yt_dlp/extractor/mgtv.py
@@ -1,9 +1,9 @@
 import base64
 import time
-import urllib.error
 import uuid
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -86,8 +86,8 @@ def _real_extract(self, url):
                 'type': 'pch5'
             }, headers=self.geo_verification_headers())['data']
         except ExtractorError as e:
-            if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read().decode(), None)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read().decode(), None)
                 if error.get('code') == 40005:
                     self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
                 raise ExtractorError(error['msg'], expected=True)
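
mgtv's hunk just above is the typical geo-restriction variant of the pattern: the error payload is pulled from e.cause.response and a provider-specific code is translated into raise_geo_restricted. Schematically (the 401/40005 pair mirrors mgtv; the _API_URL attribute is a placeholder, and other sites key on different statuses and fields):

def _download_play_data(self, video_id):
    try:
        return self._download_json(
            self._API_URL, video_id,  # hypothetical API endpoint attribute
            headers=self.geo_verification_headers())['data']
    except ExtractorError as e:
        if isinstance(e.cause, HTTPError) and e.cause.status == 401:
            error = self._parse_json(e.cause.response.read().decode(), None)
            if error.get('code') == 40005:  # provider-specific geo-block code
                self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
            raise ExtractorError(error['msg'], expected=True)
        raise
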
diff --git a/yt_dlp/extractor/minds.py b/yt_dlp/extractor/minds.py
index 2fb17920cc..27a6e38056 100644
--- a/yt_dlp/extractor/minds.py
+++ b/yt_dlp/extractor/minds.py
@@ -106,7 +106,7 @@ def _real_extract(self, url):
         if poster:
             urlh = self._request_webpage(poster, video_id, fatal=False)
             if urlh:
-                thumbnail = urlh.geturl()
+                thumbnail = urlh.url
 
         return {
             'id': video_id,
diff --git a/yt_dlp/extractor/miomio.py b/yt_dlp/extractor/miomio.py
index a0a041ea54..8df8cba191 100644
--- a/yt_dlp/extractor/miomio.py
+++ b/yt_dlp/extractor/miomio.py
@@ -2,12 +2,8 @@
 
 from .common import InfoExtractor
 from ..compat import compat_urlparse
-from ..utils import (
-    xpath_text,
-    int_or_none,
-    ExtractorError,
-    sanitized_Request,
-)
+from ..networking import Request
+from ..utils import ExtractorError, int_or_none, xpath_text
 
 
 class MioMioIE(InfoExtractor):
@@ -61,7 +57,7 @@ def _extract_mioplayer(self, webpage, video_id, title, http_headers):
             'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s'
             % (id, random.randint(100, 999)), video_id)
 
-        vid_config_request = sanitized_Request(
+        vid_config_request = Request(
             'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
             headers=http_headers)
 
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py
index d91be62700..0d700b9a82 100644
--- a/yt_dlp/extractor/mtv.py
+++ b/yt_dlp/extractor/mtv.py
@@ -2,16 +2,15 @@
 
 from .common import InfoExtractor
 from ..compat import compat_str
+from ..networking import HEADRequest, Request
 from ..utils import (
     ExtractorError,
+    RegexNotFoundError,
     find_xpath_attr,
     fix_xml_ampersands,
     float_or_none,
-    HEADRequest,
     int_or_none,
     join_nonempty,
-    RegexNotFoundError,
-    sanitized_Request,
     strip_or_none,
     timeconvert,
     try_get,
@@ -51,15 +50,15 @@ def _get_thumbnail_url(self, uri, itemdoc):
 
     def _extract_mobile_video_formats(self, mtvn_id):
         webpage_url = self._MOBILE_TEMPLATE % mtvn_id
-        req = sanitized_Request(webpage_url)
+        req = Request(webpage_url)
         # Otherwise we get a webpage that would execute some javascript
-        req.add_header('User-Agent', 'curl/7')
+        req.headers['User-Agent'] = 'curl/7'
         webpage = self._download_webpage(req, mtvn_id,
                                          'Downloading mobile page')
         metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
         req = HEADRequest(metrics_url)
         response = self._request_webpage(req, mtvn_id, 'Resolving url')
-        url = response.geturl()
+        url = response.url
         # Transform the url to get the best quality:
         url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
         return [{'url': url, 'ext': 'mp4'}]
diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py
index ddc89a7c29..299b051745 100644
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -6,9 +6,9 @@
 from .theplatform import ThePlatformIE, default_ns
 from .adobepass import AdobePassIE
 from ..compat import compat_urllib_parse_unquote
+from ..networking import HEADRequest
 from ..utils import (
     ExtractorError,
-    HEADRequest,
     RegexNotFoundError,
     UserNotLive,
     clean_html,
diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py
index 7a5a02dfa6..4f3e691b71 100644
--- a/yt_dlp/extractor/nebula.py
+++ b/yt_dlp/extractor/nebula.py
@@ -1,8 +1,8 @@
 import itertools
 import json
-import urllib.error
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start
 
 _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
@@ -48,7 +48,7 @@ def inner_call():
             return inner_call()
         except ExtractorError as exc:
             # if 401 or 403, attempt credential re-auth and retry
-            if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
+            if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403):
                 self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
                 self._perform_login()
                 return inner_call()
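
nebula.py closes this cluster with a retry-on-auth-failure pattern that survives the migration unchanged apart from the exception type: catch the wrapped HTTPError, re-login once, and re-issue the call. Loosely (a sketch; names such as _auth_headers are illustrative, not from the patch):

def _call_api(self, url, video_id):
    def inner_call():
        return self._download_json(url, video_id, headers=self._auth_headers)

    try:
        return inner_call()
    except ExtractorError as exc:
        # On 401/403, refresh credentials once and retry the request.
        if isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403):
            self._perform_login()
            return inner_call()
        raise
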
diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py
index 595709899a..5b7307bc8f 100644
--- a/yt_dlp/extractor/neteasemusic.py
+++ b/yt_dlp/extractor/neteasemusic.py
@@ -11,6 +11,7 @@
 from .common import InfoExtractor
 from ..aes import aes_ecb_encrypt, pkcs7_padding
 from ..compat import compat_urllib_parse_urlencode
+from ..networking import Request
 from ..utils import (
     ExtractorError,
     bytes_to_intlist,
@@ -18,7 +19,6 @@
     float_or_none,
     int_or_none,
     intlist_to_bytes,
-    sanitized_Request,
     try_get,
 )
 
@@ -146,8 +146,8 @@ def convert_milliseconds(cls, ms):
         return int(round(ms / 1000.0))
 
     def query_api(self, endpoint, video_id, note):
-        req = sanitized_Request('%s%s' % (self._API_BASE, endpoint))
-        req.add_header('Referer', self._API_BASE)
+        req = Request('%s%s' % (self._API_BASE, endpoint))
+        req.headers['Referer'] = self._API_BASE
         return self._download_json(req, video_id, note)
 
 
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 89e8e60939..fa2d709d28 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -8,10 +8,8 @@
 from urllib.parse import urlparse
 
 from .common import InfoExtractor, SearchInfoExtractor
-from ..compat import (
-    compat_HTTPError,
-)
 from ..dependencies import websockets
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
@@ -396,7 +394,7 @@ def _real_extract(self, url):
             webpage, handle = self._download_webpage_handle(
                 'https://www.nicovideo.jp/watch/' + video_id, video_id)
             if video_id.startswith('so'):
-                video_id = self._match_id(handle.geturl())
+                video_id = self._match_id(handle.url)
 
             api_data = self._parse_json(self._html_search_regex(
                 'data-api-data="([^"]+)"', webpage,
@@ -407,9 +405,9 @@ def _real_extract(self, url):
                     'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id,
                     note='Downloading API JSON', errnote='Unable to fetch data')['data']
             except ExtractorError:
-                if not isinstance(e.cause, compat_HTTPError):
+                if not isinstance(e.cause, HTTPError):
                     raise
-                webpage = e.cause.read().decode('utf-8', 'replace')
+                webpage = e.cause.response.read().decode('utf-8', 'replace')
                 error_msg = self._html_search_regex(
                     r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>',
                     webpage, 'error reason', default=None)
@@ -742,7 +740,7 @@ def _real_extract(self, url):
         try:
             mylist = self._call_api(list_id, 'list', {'pageSize': 1})
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                 self.raise_login_required('You have to be logged in to get your history')
             raise
         return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist))
@@ -951,8 +949,8 @@ def _real_extract(self, url):
             'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
         })
 
-        hostname = remove_start(urlparse(urlh.geturl()).hostname, 'sp.')
-        cookies = try_get(urlh.geturl(), self._downloader._calc_cookies)
+        hostname = remove_start(urlparse(urlh.url).hostname, 'sp.')
+        cookies = try_get(urlh.url, self._downloader._calc_cookies)
         latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
         if latency not in self._KNOWN_LATENCY:
             latency = 'high'
diff --git a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py
index 7b8a526f02..6078381330 100644
--- a/yt_dlp/extractor/njpwworld.py
+++ b/yt_dlp/extractor/njpwworld.py
@@ -51,7 +51,7 @@ def _perform_login(self, username, password):
             data=urlencode_postdata({'login_id': username, 'pw': password}),
             headers={'Referer': 'https://front.njpwworld.com/auth'})
         # /auth/login will return 302 for successful logins
-        if urlh.geturl() == self._LOGIN_URL:
+        if urlh.url == self._LOGIN_URL:
             self.report_warning('unable to login')
             return False
 
diff --git a/yt_dlp/extractor/nosvideo.py b/yt_dlp/extractor/nosvideo.py
index b6d3ea40c1..7e9688c0bc 100644
--- a/yt_dlp/extractor/nosvideo.py
+++ b/yt_dlp/extractor/nosvideo.py
@@ -1,9 +1,9 @@
 import re
 
 from .common import InfoExtractor
+from ..networking import Request
 from ..utils import (
     ExtractorError,
-    sanitized_Request,
     urlencode_postdata,
     xpath_text,
     xpath_with_ns,
@@ -36,8 +36,8 @@ def _real_extract(self, url):
             'op': 'download1',
             'method_free': 'Continue to Video',
         }
-        req = sanitized_Request(url, urlencode_postdata(fields))
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+        req = Request(url, urlencode_postdata(fields))
+        req.headers['Content-type'] = 'application/x-www-form-urlencoded'
         webpage = self._download_webpage(req, video_id,
                                          'Downloading download page')
         if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
diff --git a/yt_dlp/extractor/nowness.py b/yt_dlp/extractor/nowness.py
index fc9043bceb..a3c29f62cc 100644
--- a/yt_dlp/extractor/nowness.py
+++ b/yt_dlp/extractor/nowness.py
@@ -4,10 +4,8 @@
 )
 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import (
-    ExtractorError,
-    sanitized_Request,
-)
+from ..networking import Request
+from ..utils import ExtractorError
 
 
 class NownessBaseIE(InfoExtractor):
@@ -40,7 +38,7 @@ def _extract_url_result(self, post):
 
     def _api_request(self, url, request_path):
         display_id = self._match_id(url)
-        request = sanitized_Request(
+        request = Request(
             'http://api.nowness.com/api/' + request_path % display_id,
             headers={
                 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us',
diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py
index 88d08e5e3a..384865accd 100644
--- a/yt_dlp/extractor/nrk.py
+++ b/yt_dlp/extractor/nrk.py
@@ -3,7 +3,8 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -148,7 +149,7 @@ def call_playback_api(item, query=None):
             try:
                 return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query)
             except ExtractorError as e:
-                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 400:
                     return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query)
                 raise
diff --git a/yt_dlp/extractor/odkmedia.py b/yt_dlp/extractor/odkmedia.py
index 2960860d6c..b852160b9f 100644
--- a/yt_dlp/extractor/odkmedia.py
+++ b/yt_dlp/extractor/odkmedia.py
@@ -1,7 +1,7 @@
 import json
-import urllib.error
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     GeoRestrictedError,
@@ -74,8 +74,8 @@ def _real_extract(self, url):
                 f'https://odkmedia.io/odc/api/v2/playback/{video_info["id"]}/', display_id,
                 headers={'Authorization': '', 'service-name': 'odc'})
         except ExtractorError as e:
-            if isinstance(e.cause, urllib.error.HTTPError):
-                error_data = self._parse_json(e.cause.read(), display_id)['detail']
+            if isinstance(e.cause, HTTPError):
+                error_data = self._parse_json(e.cause.response.read(), display_id)['detail']
                 raise GeoRestrictedError(error_data)
 
         formats, subtitles = [], {}
diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py
index e63714e846..1be45d8adc 100644
--- a/yt_dlp/extractor/odnoklassniki.py
+++ b/yt_dlp/extractor/odnoklassniki.py
@@ -7,9 +7,9 @@
     compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
 )
+from ..networking import HEADRequest
 from ..utils import (
     ExtractorError,
-    HEADRequest,
     float_or_none,
     int_or_none,
     qualities,
@@ -448,7 +448,7 @@ def _extract_mobile(self, url):
         json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}
 
         redirect_url = self._request_webpage(HEADRequest(
-            json_data['videoSrc']), video_id, 'Requesting download URL').geturl()
+            json_data['videoSrc']), video_id, 'Requesting download URL').url
         self._clear_cookies(redirect_url)
 
         return {
diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py
index e9d23a4d12..cc3c003fa0 100644
--- a/yt_dlp/extractor/orf.py
+++ b/yt_dlp/extractor/orf.py
@@ -2,11 +2,11 @@
 import re
 
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
     clean_html,
     determine_ext,
     float_or_none,
-    HEADRequest,
     InAdvancePagedList,
     int_or_none,
     join_nonempty,
diff --git a/yt_dlp/extractor/owncloud.py b/yt_dlp/extractor/owncloud.py
index e1d5682f87..79fd830bb3 100644
--- a/yt_dlp/extractor/owncloud.py
+++ b/yt_dlp/extractor/owncloud.py
@@ -44,7 +44,7 @@ def _real_extract(self, url):
         webpage, urlh = self._download_webpage_handle(url, video_id)
 
         if re.search(r'<label[^>]+for="password"', webpage):
-            webpage = self._verify_video_password(webpage, urlh.geturl(), video_id)
+            webpage = self._verify_video_password(webpage, urlh.url, video_id)
 
         hidden_inputs = self._hidden_inputs(webpage)
         title = hidden_inputs.get('filename')
diff --git a/yt_dlp/extractor/packtpub.py b/yt_dlp/extractor/packtpub.py
index 51778d8a20..56203306fb 100644
--- a/yt_dlp/extractor/packtpub.py
+++ b/yt_dlp/extractor/packtpub.py
@@ -1,10 +1,7 @@
 import json
 
 from .common import InfoExtractor
-from ..compat import (
-    # compat_str,
-    compat_HTTPError,
-)
+from ..networking.exceptions import HTTPError
 from ..utils import (
     clean_html,
     ExtractorError,
@@ -54,8 +51,8 @@ def _perform_login(self, username, password):
                 'password': password,
             }).encode())['data']['access']
        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404):
-                message = self._parse_json(e.cause.read().decode(), None)['message']
+            if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 404):
+                message = self._parse_json(e.cause.response.read().decode(), None)['message']
                 raise ExtractorError(message, expected=True)
             raise
 
@@ -70,7 +67,7 @@ def _real_extract(self, url):
                 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id,
                 'Downloading JSON video', headers=headers)['data']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 400:
                 self.raise_login_required('This video is locked')
             raise
 
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index e93e37eb93..447087436d 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -1,10 +1,10 @@
 import itertools
-from urllib.error import HTTPError
 
 from .common import InfoExtractor
 from .vimeo import VimeoIE
 
 from ..compat import compat_urllib_parse_unquote
+from ..networking.exceptions import HTTPError
 from ..utils import (
     clean_html,
     determine_ext,
@@ -37,9 +37,9 @@ def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None
                 item_id, note='Downloading API JSON' if not note else note,
                 query=query, fatal=fatal, headers=headers)
         except ExtractorError as e:
-            if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.headers.get('Content-Type')) != 'json':
+            if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.response.headers.get('Content-Type')) != 'json':
                 raise
-            err_json = self._parse_json(self._webpage_read_content(e.cause, None, item_id), item_id, fatal=False)
+            err_json = self._parse_json(self._webpage_read_content(e.cause.response, None, item_id), item_id, fatal=False)
             err_message = traverse_obj(err_json, ('errors', ..., 'detail'), get_all=False)
             if err_message:
                 raise ExtractorError(f'Patreon said: {err_message}', expected=True)
diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py
index 4835822cf5..7864299881 100644
--- a/yt_dlp/extractor/peloton.py
+++ b/yt_dlp/extractor/peloton.py
@@ -3,7 +3,7 @@
 import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -83,8 +83,8 @@ def _login(self, video_id):
                 }).encode(),
                 headers={'Content-Type': 'application/json', 'User-Agent': 'web'})
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                json_string = self._webpage_read_content(e.cause, None, video_id)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                json_string = self._webpage_read_content(e.cause.response, None, video_id)
                 res = self._parse_json(json_string, video_id)
                 raise ExtractorError(res['message'], expected=res['message'] == 'Login failed')
             else:
@@ -96,8 +96,8 @@ def _get_token(self, video_id):
                 'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token',
                 data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'})
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                json_string = self._webpage_read_content(e.cause, None, video_id)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                json_string = self._webpage_read_content(e.cause.response, None, video_id)
                 res = self._parse_json(json_string, video_id)
                 raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached')
             else:
@@ -109,7 +109,7 @@ def _real_extract(self, url):
         try:
             self._start_session(video_id)
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                 self._login(video_id)
                 self._start_session(video_id)
             else:
diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py
index eb5923d110..5f39e06396 100644
--- a/yt_dlp/extractor/piapro.py
+++ b/yt_dlp/extractor/piapro.py
@@ -69,7 +69,7 @@ def _perform_login(self, username, password):
         if urlh is False:
             login_ok = False
         else:
-            parts = compat_urlparse.urlparse(urlh.geturl())
+            parts = compat_urlparse.urlparse(urlh.url)
             if parts.path != '/':
                 login_ok = False
         if not login_ok:
diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py
index dcf18e1f3b..00500686fe 100644
--- a/yt_dlp/extractor/pladform.py
+++ b/yt_dlp/extractor/pladform.py
@@ -78,7 +78,7 @@ def fail(text):
                 expected=True)
 
         if not video:
-            targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').geturl()
+            targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').url
             if targetUrl == url:
                 raise ExtractorError('Can\'t parse page')
             return self.url_result(targetUrl)
diff --git a/yt_dlp/extractor/platzi.py b/yt_dlp/extractor/platzi.py
index b8a4414940..166b98c4a2 100644
--- a/yt_dlp/extractor/platzi.py
+++ b/yt_dlp/extractor/platzi.py
@@ -36,7 +36,7 @@ def _perform_login(self, username, password):
             headers={'Referer': self._LOGIN_URL})
 
         # login succeeded
-        if 'platzi.com/login' not in urlh.geturl():
+        if 'platzi.com/login' not in urlh.url:
             return
 
         login_error = self._webpage_read_content(
diff --git a/yt_dlp/extractor/playplustv.py b/yt_dlp/extractor/playplustv.py
index 316f220f79..a4439c8bc5 100644
--- a/yt_dlp/extractor/playplustv.py
+++ b/yt_dlp/extractor/playplustv.py
@@ -1,13 +1,9 @@
 import json
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
-from ..utils import (
-    clean_html,
-    ExtractorError,
-    int_or_none,
-    PUTRequest,
-)
+from ..networking import PUTRequest
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, clean_html, int_or_none
 
 
 class PlayPlusTVIE(InfoExtractor):
@@ -47,9 +43,9 @@ def _perform_login(self, username, password):
         try:
             self._token = self._download_json(req, None)['token']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                 raise ExtractorError(self._parse_json(
-                    e.cause.read(), None)['errorMessage'],
+                    e.cause.response.read(), None)['errorMessage'],
                     expected=True)
             raise
         self._profile = self._call_api('Profiles')['list'][0]['_id']
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index 2f5a572a5b..f08414030b 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -3,11 +3,12 @@
 import math
 import operator
 import re
-import urllib.request
 
 from .common import InfoExtractor
 from .openload import PhantomJSwrapper
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking import Request
+from ..networking.exceptions import HTTPError
 from ..utils import (
     NO_DEFAULT,
     ExtractorError,
@@ -46,8 +47,8 @@ def dl(*args, **kwargs):
                 r'document\.cookie\s*=\s*["\']RNKEY=',
                 r'document\.location\.reload\(true\)')):
             url_or_request = args[0]
-            url = (url_or_request.get_full_url()
-                   if isinstance(url_or_request, urllib.request.Request)
+            url = (url_or_request.url
+                   if isinstance(url_or_request, Request)
                    else url_or_request)
             phantom = PhantomJSwrapper(self, required_version='2.0')
             phantom.get(url, html=webpage)
@@ -602,7 +603,7 @@ def download_page(base_url, num, fallback=False):
                 base_url, item_id, note, query={'page': num})
 
         def is_404(e):
-            return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+            return isinstance(e.cause, HTTPError) and e.cause.status == 404
 
         base_url = url
         has_page = page is not None
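
pornhub.py's first hunk is a reminder that the new Request class also replaces urllib.request.Request on the request side, with get_full_url() collapsing into the same .url property used on responses. A sketch of that accept-either helper shape:

from ..networking import Request

def _to_url(url_or_request):
    # Accepts either a plain string or a networking.Request;
    # previously this required urllib.request.Request and get_full_url().
    return url_or_request.url if isinstance(url_or_request, Request) else url_or_request
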
diff --git a/yt_dlp/extractor/puhutv.py b/yt_dlp/extractor/puhutv.py
index 482e5705f0..4b8e5e90de 100644
--- a/yt_dlp/extractor/puhutv.py
+++ b/yt_dlp/extractor/puhutv.py
@@ -1,8 +1,6 @@
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -72,7 +70,7 @@ def _real_extract(self, url):
                 display_id, 'Downloading video JSON',
                 headers=self.geo_verification_headers())
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 self.raise_geo_restricted()
             raise
 
diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py
index 7fdf782831..cef68eba08 100644
--- a/yt_dlp/extractor/radiko.py
+++ b/yt_dlp/extractor/radiko.py
@@ -41,7 +41,7 @@ def _auth_client(self):
             'x-radiko-device': 'pc',
             'x-radiko-user': 'dummy_user',
         })
-        auth1_header = auth1_handle.info()
+        auth1_header = auth1_handle.headers
 
         auth_token = auth1_header['X-Radiko-AuthToken']
         kl = int(auth1_header['X-Radiko-KeyLength'])
diff --git a/yt_dlp/extractor/radiocanada.py b/yt_dlp/extractor/radiocanada.py
index 72c21d5022..1a5a6355a6 100644
--- a/yt_dlp/extractor/radiocanada.py
+++ b/yt_dlp/extractor/radiocanada.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -74,8 +74,8 @@ def _call_api(self, path, video_id=None, app_code=None, query=None):
             return self._download_json(
                 'https://services.radio-canada.ca/media/' + path, video_id, query=query)
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
-                data = self._parse_json(e.cause.read().decode(), None)
+            if isinstance(e.cause, HTTPError) and e.cause.status in (401, 422):
+                data = self._parse_json(e.cause.response.read().decode(), None)
                 error = data.get('error_description') or data['errorMessage']['text']
                 raise ExtractorError(error, expected=True)
             raise
diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py
index 2440858ca1..028d3d90bb 100644
--- a/yt_dlp/extractor/rcs.py
+++ b/yt_dlp/extractor/rcs.py
@@ -1,9 +1,9 @@
 import re
 
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
     ExtractorError,
-    HEADRequest,
     base_url,
     clean_html,
     extract_attributes,
diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py
index 27b4ad7bbc..79d9c8e31e 100644
--- a/yt_dlp/extractor/rcti.py
+++ b/yt_dlp/extractor/rcti.py
@@ -3,7 +3,7 @@
 import time
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     dict_get,
     ExtractorError,
@@ -186,7 +186,7 @@ def _real_extract(self, url):
         try:
             formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'})
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 self.raise_geo_restricted(countries=['ID'], metadata_available=True)
             else:
                 raise e
diff --git a/yt_dlp/extractor/recurbate.py b/yt_dlp/extractor/recurbate.py
index 5534cf3c35..d7294cb143 100644
--- a/yt_dlp/extractor/recurbate.py
+++ b/yt_dlp/extractor/recurbate.py
@@ -1,6 +1,5 @@
-import urllib.error
-
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import ExtractorError, merge_dicts
 
 
@@ -25,7 +24,7 @@ def _real_extract(self, url):
         try:
             webpage = self._download_webpage(url, video_id)
         except ExtractorError as e:
-            if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies')
             raise
         token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token')
diff --git a/yt_dlp/extractor/redbulltv.py b/yt_dlp/extractor/redbulltv.py
index a01bc8434c..d1de2490fc 100644
--- a/yt_dlp/extractor/redbulltv.py
+++ b/yt_dlp/extractor/redbulltv.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     float_or_none,
     ExtractorError,
@@ -68,9 +68,9 @@ def extract_info(self, video_id):
                 headers={'Authorization': token}
             )
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                 error_message = self._parse_json(
-                    e.cause.read().decode(), video_id)['error']
+                    e.cause.response.read().decode(), video_id)['error']
                 raise ExtractorError('%s said: %s' % (
                     self.IE_NAME, error_message), expected=True)
             raise
diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py
index 098fb81857..f9453202b7 100644
--- a/yt_dlp/extractor/redgifs.py
+++ b/yt_dlp/extractor/redgifs.py
@@ -1,8 +1,8 @@
 import functools
-import urllib
 
 from .common import InfoExtractor
 from ..compat import compat_parse_qs
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -82,7 +82,7 @@ def _call_api(self, ep, video_id, *args, **kwargs):
                     f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs)
                 break
             except ExtractorError as e:
-                if first_attempt and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
+                if first_attempt and isinstance(e.cause, HTTPError) and e.cause.status == 401:
                     del self._API_HEADERS['authorization']  # refresh the token
                     continue
                 raise
diff --git a/yt_dlp/extractor/regiotv.py b/yt_dlp/extractor/regiotv.py
index 6114841fb2..edb6ae5bce 100644
--- a/yt_dlp/extractor/regiotv.py
+++ b/yt_dlp/extractor/regiotv.py
@@ -1,10 +1,6 @@
 from .common import InfoExtractor
-
-from ..utils import (
-    sanitized_Request,
-    xpath_text,
-    xpath_with_ns,
-)
+from ..networking import Request
+from ..utils import xpath_text, xpath_with_ns
 
 
 class RegioTVIE(InfoExtractor):
@@ -33,7 +29,7 @@ def _real_extract(self, url):
 
         SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'
 
-        request = sanitized_Request(
+        request = Request(
             'http://v.telvi.de/',
             SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
         video_data = self._download_xml(request, video_id, 'Downloading video XML')
diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py
index 0e40eb32a3..4a4d40befd 100644
--- a/yt_dlp/extractor/rokfin.py
+++ b/yt_dlp/extractor/rokfin.py
@@ -245,7 +245,7 @@ def _perform_login(self, username, password):
             f'{self._AUTH_BASE}/token', None,
             note='getting access credentials', errnote='error getting access credentials',
             data=urlencode_postdata({
-                'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.geturl()).fragment).get('code')[0],
+                'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.url).fragment).get('code')[0],
                 'client_id': 'web',
                 'grant_type': 'authorization_code',
                 'redirect_uri': 'https://rokfin.com/silent-check-sso.html'
@@ -269,7 +269,7 @@ def _download_json_using_access_token(self, url_or_request, video_id, headers={}
         json_string, urlh = self._download_webpage_handle(
             url_or_request, video_id, headers=headers, query=query, expected_status=401)
-        if not auth_token or urlh.code != 401 or refresh_token is None:
+        if not auth_token or urlh.status != 401 or refresh_token is None:
             return self._parse_json(json_string, video_id)
 
         self._access_mgmt_tokens = self._download_json(
diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py
index 776fbfbc08..94e673b133 100644
--- a/yt_dlp/extractor/roosterteeth.py
+++ b/yt_dlp/extractor/roosterteeth.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -35,8 +35,8 @@ def _perform_login(self, username, password):
                 }))
         except ExtractorError as e:
             msg = 'Unable to login'
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                resp = self._parse_json(e.cause.response.read().decode(), None, fatal=False)
                 if resp:
                     error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
                     if error:
@@ -138,8 +138,8 @@ def _real_extract(self, url):
             m3u8_url = video_data['attributes']['url']
             # XXX: additional URL at video_data['links']['download']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                if self._parse_json(e.cause.response.read().decode(), display_id).get('access') is False:
                     self.raise_login_required(
                         '%s is only available for FIRST members' % display_id)
             raise
diff --git a/yt_dlp/extractor/rozhlas.py b/yt_dlp/extractor/rozhlas.py
index 5f83d42e83..63134322dc 100644
--- a/yt_dlp/extractor/rozhlas.py
+++ b/yt_dlp/extractor/rozhlas.py
@@ -1,7 +1,7 @@
 import itertools
-import urllib.error
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     extract_attributes,
@@ -81,7 +81,7 @@ def _extract_formats(self, entry, audio_id):
                     'vcodec': 'none',
                 })
             except ExtractorError as e:
-                if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 429:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 429:
                     retry.error = e.cause
                 else:
                     self.report_warning(e.msg)
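
rokfin's hunk near the top of this cluster shows that the renaming also applies to successful response handles: urlh.code/urlh.getcode() becomes urlh.status. A sketch of that expected-status flow (_refresh_tokens is a made-up helper):

def _download_with_auth(self, url, video_id):
    # expected_status lets the 401 through instead of raising, so the
    # status can be inspected on the handle afterwards.
    json_string, urlh = self._download_webpage_handle(url, video_id, expected_status=401)
    if urlh.status == 401:
        self._refresh_tokens()  # hypothetical re-auth helper
        json_string, urlh = self._download_webpage_handle(url, video_id)
    return self._parse_json(json_string, video_id)
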
diff --git a/yt_dlp/extractor/rte.py b/yt_dlp/extractor/rte.py
index aedaa5b550..7ba80d4ba7 100644
--- a/yt_dlp/extractor/rte.py
+++ b/yt_dlp/extractor/rte.py
@@ -1,7 +1,7 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     float_or_none,
     parse_iso8601,
@@ -31,8 +31,8 @@ def _real_extract(self, url):
             except ExtractorError as ee:
                 if num < len(ENDPOINTS) or formats:
                     continue
-                if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
-                    error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
+                if isinstance(ee.cause, HTTPError) and ee.cause.status == 404:
+                    error_info = self._parse_json(ee.cause.response.read().decode(), item_id, fatal=False)
                     if error_info:
                         raise ExtractorError(
                             '%s said: %s' % (self.IE_NAME, error_info['message']),
diff --git a/yt_dlp/extractor/rts.py b/yt_dlp/extractor/rts.py
index 81c4d7cac8..9f73d1811f 100644
--- a/yt_dlp/extractor/rts.py
+++ b/yt_dlp/extractor/rts.py
@@ -136,8 +136,8 @@ def download_json(internal_id):
 
         if not entries:
             page, urlh = self._download_webpage_handle(url, display_id)
-            if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id:
-                return self.url_result(urlh.geturl(), 'RTS')
+            if re.match(self._VALID_URL, urlh.url).group('id') != media_id:
+                return self.url_result(urlh.url, 'RTS')
 
             # article with videos on rhs
             videos = re.findall(
diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py
index 82f3f0f8c2..f8bf4a1825 100644
--- a/yt_dlp/extractor/rumble.py
+++ b/yt_dlp/extractor/rumble.py
@@ -2,7 +2,7 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     UnsupportedError,
@@ -371,7 +371,7 @@ def entries(self, url, playlist_id):
             try:
                 webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
             except ExtractorError as e:
-                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                     break
                 raise
             for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
diff --git a/yt_dlp/extractor/safari.py b/yt_dlp/extractor/safari.py
index 450a661e9f..8d322d7105 100644
--- a/yt_dlp/extractor/safari.py
+++ b/yt_dlp/extractor/safari.py
@@ -28,13 +28,13 @@ def _perform_login(self, username, password):
             'Downloading login page')
 
         def is_logged(urlh):
-            return 'learning.oreilly.com/home/' in urlh.geturl()
+            return 'learning.oreilly.com/home/' in urlh.url
 
         if is_logged(urlh):
             self.LOGGED_IN = True
             return
 
-        redirect_url = urlh.geturl()
+        redirect_url = urlh.url
         parsed_url = compat_urlparse.urlparse(redirect_url)
         qs = compat_parse_qs(parsed_url.query)
         next_uri = compat_urlparse.urljoin(
@@ -129,7 +129,7 @@ def _real_extract(self, url):
 
         webpage, urlh = self._download_webpage_handle(url, video_id)
 
-        mobj = re.match(self._VALID_URL, urlh.geturl())
+        mobj = re.match(self._VALID_URL, urlh.url)
         reference_id = mobj.group('reference_id')
         if not reference_id:
             reference_id = self._search_regex(
diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py
index 119106e8ef..7a91150475 100644
--- a/yt_dlp/extractor/sbs.py
+++ b/yt_dlp/extractor/sbs.py
@@ -1,6 +1,6 @@
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
-    HEADRequest,
     float_or_none,
     int_or_none,
     parse_duration,
diff --git a/yt_dlp/extractor/sevenplus.py b/yt_dlp/extractor/sevenplus.py
index 222bf6ce7a..6c688d1505 100644
--- a/yt_dlp/extractor/sevenplus.py
+++ b/yt_dlp/extractor/sevenplus.py
@@ -2,10 +2,8 @@
 import re
 
 from .brightcove import BrightcoveNewBaseIE
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     try_get,
@@ -97,9 +95,9 @@ def _real_extract(self, url):
                 'videoType': 'vod',
             }, headers=headers)['media']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
ExtractorError(self._parse_json( - e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + e.cause.response.read().decode(), episode_id)[0]['error_code'], expected=True) raise for source in media.get('sources', {}): diff --git a/yt_dlp/extractor/shahid.py b/yt_dlp/extractor/shahid.py index 26a0bff400..d509e8879c 100644 --- a/yt_dlp/extractor/shahid.py +++ b/yt_dlp/extractor/shahid.py @@ -3,7 +3,7 @@ import re from .aws import AWSIE -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -22,7 +22,7 @@ class ShahidBaseIE(AWSIE): def _handle_error(self, e): fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) + e.cause.response.read().decode('utf-8'), None, fatal=False) if fail_data: faults = fail_data.get('faults', []) faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) @@ -40,7 +40,7 @@ def _call_api(self, path, video_id, request=None): 'secret_key': '4WUUJWuFvtTkXbhaWTDv7MhO+0LqoYDWfEnUXoWn', }, video_id, query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, HTTPError): self._handle_error(e) raise @@ -88,7 +88,7 @@ def _perform_login(self, username, password): 'Content-Type': 'application/json; charset=UTF-8', })['user'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, HTTPError): self._handle_error(e) raise diff --git a/yt_dlp/extractor/sina.py b/yt_dlp/extractor/sina.py index aeba4e3771..9842811888 100644 --- a/yt_dlp/extractor/sina.py +++ b/yt_dlp/extractor/sina.py @@ -1,12 +1,12 @@ from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, - int_or_none, - update_url_query, - qualities, - get_element_by_attribute, clean_html, + get_element_by_attribute, + int_or_none, + qualities, + update_url_query, ) @@ -60,7 +60,7 @@ def _real_extract(self, url): self.to_screen('Getting video id') request = HEADRequest(url) _, urlh = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) + return self._real_extract(urlh.url) else: pseudo_id = mobj.group('pseudo_id') webpage = self._download_webpage(url, pseudo_id) diff --git a/yt_dlp/extractor/sixplay.py b/yt_dlp/extractor/sixplay.py index a6fb6c1f5b..ef93b92768 100644 --- a/yt_dlp/extractor/sixplay.py +++ b/yt_dlp/extractor/sixplay.py @@ -79,7 +79,7 @@ def _real_extract(self, url): headers=self.geo_verification_headers()) if not urlh: continue - asset_url = urlh.geturl() + asset_url = urlh.url asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') for i in range(3, 0, -1): asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 3d36edbbc3..25f867a601 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -426,7 +426,7 @@ def _real_extract(self, url): video_id, headers=traverse_obj(parse_qs(url), { 'Referer': ('embed_parent_url', -1), 'Origin': ('embed_container_origin', -1)})) - redirect_url = urlh.geturl() + redirect_url = urlh.url if 'domain_not_allowed' in redirect_url: domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False) if not domain: diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py index 5ebe20df7a..4379572592 100644 --- a/yt_dlp/extractor/sonyliv.py +++ 
b/yt_dlp/extractor/sonyliv.py @@ -6,7 +6,7 @@ import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -123,12 +123,12 @@ def _call_api(self, version, path, video_id): 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), video_id, headers=self._HEADERS)['resultObj'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406 and self._parse_json( - e.cause.read().decode(), video_id)['message'] == 'Please subscribe to watch this content': + if isinstance(e.cause, HTTPError) and e.cause.status == 406 and self._parse_json( + e.cause.response.read().decode(), video_id)['message'] == 'Please subscribe to watch this content': self.raise_login_required(self._LOGIN_HINT, method=None) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: message = self._parse_json( - e.cause.read().decode(), video_id)['message'] + e.cause.response.read().decode(), video_id)['message'] if message == 'Geoblocked Country': self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(message) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 979f23f44f..a7c2afd497 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -7,15 +7,13 @@ InfoExtractor, SearchInfoExtractor ) -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking import HEADRequest, Request +from ..networking.exceptions import HTTPError from ..utils import ( error_to_compat_str, ExtractorError, float_or_none, - HEADRequest, int_or_none, KNOWN_EXTENSIONS, mimetype2ext, @@ -26,7 +24,6 @@ update_url_query, url_or_none, urlhandle_detect_ext, - sanitized_Request, ) @@ -103,7 +100,7 @@ def _download_json(self, *args, **kwargs): try: return super()._download_json(*args, **kwargs) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): self._store_client_id(None) self._update_client_id() continue @@ -123,7 +120,7 @@ def _perform_login(self, username, password): self._access_token = password query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID payload = {'session': {'access_token': self._access_token}} - token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) + token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) if response is not False: self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} @@ -212,7 +209,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f urlh = self._request_webpage( HEADRequest(redirect_url), track_id, fatal=False) if urlh: - format_url = urlh.geturl() + format_url = urlh.url format_urls.add(format_url) formats.append({ 'format_id': 'download', @@ -669,7 +666,7 @@ def _entries(self, url, playlist_id): except ExtractorError as e: # Downloading page may result in intermittent 502 HTTP error # See https://github.com/yt-dlp/yt-dlp/issues/872 - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502: + if not isinstance(e.cause, HTTPError) or e.cause.status != 502: raise retry.error = e 
continue diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py index c212a4926e..01906bda9d 100644 --- a/yt_dlp/extractor/teachable.py +++ b/yt_dlp/extractor/teachable.py @@ -56,7 +56,7 @@ def is_logged(webpage): self._logged_in = True return - login_url = urlh.geturl() + login_url = urlh.url login_form = self._hidden_inputs(login_page) diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py index 88f29cb83c..54e74a6c02 100644 --- a/yt_dlp/extractor/telemundo.py +++ b/yt_dlp/extractor/telemundo.py @@ -1,9 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - try_get, - unified_timestamp, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import try_get, unified_timestamp class TelemundoIE(InfoExtractor): @@ -38,7 +35,7 @@ def _real_extract(self, url): m3u8_url = self._request_webpage(HEADRequest( redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'), - video_id, 'Processing m3u8').geturl() + video_id, 'Processing m3u8').url formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') date = unified_timestamp(try_get( metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1])) diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py index bc64226bf0..c1b4a33124 100644 --- a/yt_dlp/extractor/tennistv.py +++ b/yt_dlp/extractor/tennistv.py @@ -86,7 +86,7 @@ def _perform_login(self, username, password): }) self.get_token(None, { - 'code': urllib.parse.parse_qs(handle.geturl())['code'][-1], + 'code': urllib.parse.parse_qs(handle.url)['code'][-1], 'grant_type': 'authorization_code', 'client_id': 'tennis-tv-web', 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html' diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index 633032e310..c7097cf025 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -2,11 +2,8 @@ import base64 from .common import InfoExtractor -from ..utils import ( - HEADRequest, - int_or_none, - urlencode_postdata, -) +from ..networking import HEADRequest +from ..utils import int_or_none, urlencode_postdata class TenPlayIE(InfoExtractor): @@ -94,7 +91,7 @@ def _real_extract(self, url): data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', headers=headers).get('source') m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).geturl() + _video_url), content_id).url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') diff --git a/yt_dlp/extractor/tfo.py b/yt_dlp/extractor/tfo.py index a24789cb37..d417f50e10 100644 --- a/yt_dlp/extractor/tfo.py +++ b/yt_dlp/extractor/tfo.py @@ -1,12 +1,8 @@ import json from .common import InfoExtractor -from ..utils import ( - HEADRequest, - ExtractorError, - int_or_none, - clean_html, -) +from ..networking import HEADRequest +from ..utils import ExtractorError, clean_html, int_or_none class TFOIE(InfoExtractor): diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index e659b8ee10..537f6f6cd0 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -7,13 +7,13 @@ from .once import OnceIE from .adobepass import AdobePassIE +from ..networking import Request from ..utils import ( determine_ext, ExtractorError, float_or_none, int_or_none, parse_qs, - sanitized_Request, unsmuggle_url, 
update_url_query, xpath_with_ns, @@ -270,7 +270,7 @@ def _real_extract(self, url): source_url = smuggled_data.get('source_url') if source_url: headers['Referer'] = source_url - request = sanitized_Request(url, headers=headers) + request = Request(url, headers=headers) webpage = self._download_webpage(request, video_id) smil_url = self._search_regex( r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index 55b6413ae6..cc7beeea52 100644 --- a/yt_dlp/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import HEADRequest +from ..networking import HEADRequest class ThisOldHouseIE(InfoExtractor): @@ -50,6 +50,6 @@ def _real_extract(self, url): r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', webpage, 'video url') if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage: - return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id) + return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).url, 'Zype', display_id) video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id') return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py index b1041902bf..7841f8da69 100644 --- a/yt_dlp/extractor/threeqsdn.py +++ b/yt_dlp/extractor/threeqsdn.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -90,7 +90,7 @@ def _real_extract(self, url): config = self._download_json( url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_geo_restricted() raise diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 2f491c3170..48de61f934 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -7,9 +7,9 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, LazyList, UnsupportedError, UserNotLive, @@ -1084,7 +1084,7 @@ class TikTokVMIE(InfoExtractor): def _real_extract(self, url): new_url = self._request_webpage( - HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl() + HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url if self.suitable(new_url): # Prevent infinite loop in case redirect fails raise UnsupportedError(new_url) return self.url_result(new_url) diff --git a/yt_dlp/extractor/toutv.py b/yt_dlp/extractor/toutv.py index f60c199f0e..ced1224fad 100644 --- a/yt_dlp/extractor/toutv.py +++ b/yt_dlp/extractor/toutv.py @@ -1,7 +1,7 @@ import json from .radiocanada import RadioCanadaIE -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -52,8 +52,8 @@ def _perform_login(self, username, password): 
'Content-Type': 'application/json;charset=utf-8', })['access_token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['Message'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), None)['Message'] raise ExtractorError(error, expected=True) raise self._claims = self._call_api('validation/v2/getClaims')['claims'] diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py index 6a4dadb9bd..c5d01c8271 100644 --- a/yt_dlp/extractor/triller.py +++ b/yt_dlp/extractor/triller.py @@ -3,9 +3,9 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, UnsupportedError, determine_ext, int_or_none, @@ -327,7 +327,7 @@ class TrillerShortIE(InfoExtractor): }] def _real_extract(self, url): - real_url = self._request_webpage(HEADRequest(url), self._match_id(url)).geturl() + real_url = self._request_webpage(HEADRequest(url), self._match_id(url)).url if self.suitable(real_url): # Prevent infinite loop in case redirect fails raise UnsupportedError(real_url) return self.url_result(real_url) diff --git a/yt_dlp/extractor/trueid.py b/yt_dlp/extractor/trueid.py index 696343627b..86f0990e83 100644 --- a/yt_dlp/extractor/trueid.py +++ b/yt_dlp/extractor/trueid.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -88,9 +88,9 @@ def _real_extract(self, url): stream_data = self._download_json( f'https://{domain}/cmsPostProxy/contents/video/{video_id}/streamer?os=android', video_id, data=b'')['data'] except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise e - errmsg = self._parse_json(e.cause.read().decode(), video_id)['meta']['message'] + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['meta']['message'] if 'country' in errmsg: self.raise_geo_restricted( errmsg, [initial_data['display_country']] if initial_data.get('display_country') else None, True) diff --git a/yt_dlp/extractor/tubetugraz.py b/yt_dlp/extractor/tubetugraz.py index 2199fea19a..a351e4e550 100644 --- a/yt_dlp/extractor/tubetugraz.py +++ b/yt_dlp/extractor/tubetugraz.py @@ -22,7 +22,7 @@ def _perform_login(self, username, password): return content, urlh = self._download_webpage_handle( - urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()}, + urlh.url, None, fatal=False, headers={'referer': urlh.url}, note='logging in', errnote='unable to log in', data=urlencode_postdata({ 'lang': 'de', @@ -30,7 +30,7 @@ def _perform_login(self, username, password): 'j_username': username, 'j_password': password })) - if not urlh or urlh.geturl() == 'https://tube.tugraz.at/paella/ui/index.html': + if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': return if not self._html_search_regex( @@ -40,14 +40,14 @@ def _perform_login(self, username, password): return content, urlh = self._download_webpage_handle( - urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()}, + urlh.url, None, fatal=False, headers={'referer': urlh.url}, note='logging in with TFA', errnote='unable to log in with TFA', data=urlencode_postdata({ 'lang': 'de', '_eventId_proceed': '', 'j_tokenNumber': self._get_tfa_info(), })) - if not urlh or urlh.geturl() == 
'https://tube.tugraz.at/paella/ui/index.html': + if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': return self.report_warning('unable to login: incorrect TFA code') diff --git a/yt_dlp/extractor/tubitv.py index de8b5da697..bd46bc3630 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -1,13 +1,13 @@ import re from .common import InfoExtractor +from ..networking import Request from ..utils import ( ExtractorError, int_or_none, js_to_json, - sanitized_Request, - urlencode_postdata, traverse_obj, + urlencode_postdata, ) @@ -72,8 +72,8 @@ def _perform_login(self, username, password): 'password': password, } payload = urlencode_postdata(form_data) - request = sanitized_Request(self._LOGIN_URL, payload) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, payload) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' login_page = self._download_webpage( request, None, False, 'Wrong login info') if not re.search(r'id="tubi-logout"', login_page): diff --git a/yt_dlp/extractor/tumblr.py index 88d4ae32de..a26bdcaae7 100644 --- a/yt_dlp/extractor/tumblr.py +++ b/yt_dlp/extractor/tumblr.py @@ -274,7 +274,7 @@ def _real_extract(self, url): url = f'http://{blog}.tumblr.com/post/{video_id}/' webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = urlh.geturl() + redirect_url = urlh.url api_only = bool(self._search_regex( r'(tumblr.com|^)/(safe-mode|login_required|blog/view)', diff --git a/yt_dlp/extractor/tunein.py index e02121bd8b..fd2fe132c1 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -225,10 +225,10 @@ def _real_extract(self, url): urlh = self._request_webpage( url, redirect_id, note='Downloading redirect page') - url = urlh.geturl() + url = urlh.url url_parsed = urllib.parse.urlparse(url) if url_parsed.port == 443: url = url_parsed._replace(netloc=url_parsed.hostname).geturl() self.to_screen('Following redirect: %s' % url) return self.url_result(url) diff --git a/yt_dlp/extractor/tv2.py index c51e633712..f6b452dc80 100644 --- a/yt_dlp/extractor/tv2.py +++ b/yt_dlp/extractor/tv2.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -57,8 +57,8 @@ def _real_extract(self, url): headers={'content-type': 'application/json'}, data='{"device":{"id":"1-1-1","name":"Nettleser (HTML)"}}'.encode())['playback'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), video_id)['error'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), video_id)['error'] error_code = error.get('code') if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': self.raise_geo_restricted(countries=self._GEO_COUNTRIES) @@ -211,8 +211,8 @@ def _real_extract(self, url): api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol, video_id, 'Downloading play JSON')['playback'] except ExtractorError as e: - if isinstance(e.cause,
HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), video_id)['error'] error_code = error.get('code') if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION': self.raise_geo_restricted(countries=self._GEO_COUNTRIES) diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index c686044fa2..2aa0dd870a 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -488,9 +488,9 @@ def _call_api(self, resource, video_id, query={}, **kwargs): f'{self._API_BASE_URL}/{resource}', video_id, query={'lang': 'pl', 'platform': 'BROWSER', **query}, expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs) - if is_valid(urlh.getcode()): + if is_valid(urlh.status): return document - raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.getcode()})') + raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})') def _parse_video(self, video, with_url=True): info_dict = traverse_obj(video, { diff --git a/yt_dlp/extractor/tvplay.py b/yt_dlp/extractor/tvplay.py index e056f9872c..48a6efe1cc 100644 --- a/yt_dlp/extractor/tvplay.py +++ b/yt_dlp/extractor/tvplay.py @@ -1,10 +1,8 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urlparse, -) +from ..compat import compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -129,8 +127,8 @@ def _real_extract(self, url): 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + msg = self._parse_json(e.cause.response.read().decode('utf-8'), video_id) raise ExtractorError(msg['msg'], expected=True) raise diff --git a/yt_dlp/extractor/tvplayer.py b/yt_dlp/extractor/tvplayer.py index b05355f876..228c2366ed 100644 --- a/yt_dlp/extractor/tvplayer.py +++ b/yt_dlp/extractor/tvplayer.py @@ -1,8 +1,6 @@ from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( extract_attributes, try_get, @@ -64,9 +62,9 @@ def _real_extract(self, url): 'validate': validate, }))['tvplayer']['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, HTTPError): response = self._parse_json( - e.cause.read().decode(), resource_id)['tvplayer']['response'] + e.cause.response.read().decode(), resource_id)['tvplayer']['response'] raise ExtractorError( '%s said: %s' % (self.IE_NAME, response['error']), expected=True) raise diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 2548dae047..dff353a4f9 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -107,9 +107,9 @@ def _real_extract(self, url): url, video_id, data=request_data, headers={'Origin': 'https://twitcasting.tv'}, note='Trying video password') - if urlh.geturl() != url and request_data: + if urlh.url != url and request_data: webpage = self._download_webpage( - urlh.geturl(), video_id, data=request_data, + urlh.url, video_id, data=request_data, headers={'Origin': 'https://twitcasting.tv'}, note='Retrying authentication') # has to check here as the first request can contain password input form even if the password 
is correct diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index c8ee520144..3297ef0917 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -71,7 +71,7 @@ def login_step(page, urlh, note, data): form = self._hidden_inputs(page) form.update(data) - page_url = urlh.geturl() + page_url = urlh.url post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, 'post url', default=self._LOGIN_POST_URL, group='url') diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index fc157ac228..4015277a86 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1596,7 +1596,7 @@ def _real_extract(self, url): if eid: id = eid url = self._BASE_URL + id - new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl() + new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).url __UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link=" if new_url.startswith(__UNSAFE_LINK): new_url = new_url.replace(__UNSAFE_LINK, "") diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py index 329e5da2d9..5c296051af 100644 --- a/yt_dlp/extractor/udemy.py +++ b/yt_dlp/extractor/udemy.py @@ -1,8 +1,9 @@ import re -import urllib.request from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str, compat_urlparse +from ..compat import compat_str, compat_urlparse +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -10,7 +11,6 @@ float_or_none, int_or_none, js_to_json, - sanitized_Request, smuggle_url, try_get, unescapeHTML, @@ -153,11 +153,10 @@ def _download_json(self, url_or_request, *args, **kwargs): headers['X-Udemy-Bearer-Token'] = cookie.value headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value - if isinstance(url_or_request, urllib.request.Request): - for header, value in headers.items(): - url_or_request.add_header(header, value) + if isinstance(url_or_request, Request): + url_or_request.headers.update(headers) else: - url_or_request = sanitized_Request(url_or_request, headers=headers) + url_or_request = Request(url_or_request, headers=headers) response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) @@ -212,7 +211,7 @@ def _real_extract(self, url): lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: webpage = webpage or self._download_webpage(url, lecture_id) self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) diff --git a/yt_dlp/extractor/vevo.py b/yt_dlp/extractor/vevo.py index da4ce49ca6..aa40227a76 100644 --- a/yt_dlp/extractor/vevo.py +++ b/yt_dlp/extractor/vevo.py @@ -2,10 +2,8 @@ import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -184,8 +182,8 @@ def _call_api(self, path, *args, **kwargs): try: data = self._download_json(self._api_url_template % path, *args, **kwargs) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errors = self._parse_json(e.cause.read().decode(), None)['errors'] + if 
isinstance(e.cause, HTTPError): + errors = self._parse_json(e.cause.response.read().decode(), None)['errors'] error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) raise diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index d1a3b48aac..8a71268539 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -7,10 +7,8 @@ from .adobepass import AdobePassIE from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -140,8 +138,8 @@ def _real_extract(self, url): 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), video_id, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): - error = json.loads(e.cause.read().decode()) + if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401): + error = json.loads(e.cause.response.read().decode()) error_message = error.get('error_description') or error['details'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) diff --git a/yt_dlp/extractor/videocampus_sachsen.py b/yt_dlp/extractor/videocampus_sachsen.py index 982ab3dd08..37bc7d7181 100644 --- a/yt_dlp/extractor/videocampus_sachsen.py +++ b/yt_dlp/extractor/videocampus_sachsen.py @@ -2,7 +2,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ExtractorError, OnDemandPagedList, urlencode_postdata @@ -169,7 +169,7 @@ def _real_extract(self, url): f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', video_id, 'mp4', m3u8_id='hls', fatal=True) except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (404, 500): raise formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'}) diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py index 23e1aaf202..770aa284da 100644 --- a/yt_dlp/extractor/vidio.py +++ b/yt_dlp/extractor/vidio.py @@ -39,7 +39,7 @@ def is_logged_in(): login_post, login_post_urlh = self._download_webpage_handle( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401]) - if login_post_urlh.getcode() == 401: + if login_post_urlh.status == 401: if get_element_by_class('onboarding-content-register-popup__title', login_post): raise ExtractorError( 'Unable to log in: The provided email has not registered yet.', expected=True) diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index cde4274d9c..44353b7fc4 100644 --- a/yt_dlp/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py @@ -1,8 +1,8 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, format_field, float_or_none, get_element_by_id, diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py index 3812601148..8f686f05db 100644 --- a/yt_dlp/extractor/viewlift.py +++ b/yt_dlp/extractor/viewlift.py @@ -1,7 +1,7 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -46,8 
+46,8 @@ def _call_api(self, site, path, video_id, url, query): return self._download_json( self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - webpage = e.cause.read().decode() + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + webpage = e.cause.response.read().decode() try: error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message') except json.JSONDecodeError: diff --git a/yt_dlp/extractor/viidea.py b/yt_dlp/extractor/viidea.py index 4cdf2677b2..649ffe395b 100644 --- a/yt_dlp/extractor/viidea.py +++ b/yt_dlp/extractor/viidea.py @@ -2,10 +2,10 @@ from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urlparse, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, js_to_json, @@ -133,9 +133,9 @@ def _real_extract(self, url): '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: msg = self._parse_json( - e.cause.read().decode('utf-8'), lecture_id) + e.cause.response.read().decode('utf-8'), lecture_id) raise ExtractorError(msg['detail'], expected=True) raise diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index d81d9c5518..e72fa50fa8 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -2,20 +2,16 @@ import functools import re import itertools -import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urlparse, -) +from ..compat import compat_str, compat_urlparse +from ..networking import HEADRequest, Request +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, determine_ext, ExtractorError, get_element_by_class, - HEADRequest, js_to_json, int_or_none, merge_dicts, @@ -23,7 +19,6 @@ parse_filesize, parse_iso8601, parse_qs, - sanitized_Request, smuggle_url, str_or_none, try_get, @@ -72,7 +67,7 @@ def _perform_login(self, username, password): 'Referer': self._LOGIN_URL, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418: + if isinstance(e.cause, HTTPError) and e.cause.status == 418: raise ExtractorError( 'Unable to log in: bad username or password', expected=True) @@ -809,7 +804,7 @@ def _try_album_password(self, url): 'X-Requested-With': 'XMLHttpRequest', }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError('Wrong password', expected=True) raise @@ -832,10 +827,10 @@ def _real_extract(self, url): # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = urlh.geturl() + redirect_url = urlh.url except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - errmsg = ee.cause.read() + if isinstance(ee.cause, HTTPError) and ee.cause.status == 403: + errmsg = ee.cause.response.read() if b'Because of its privacy settings, this video cannot be played here' in errmsg: raise ExtractorError( 'Cannot download embed-only video without embedding ' @@ -1154,7 +1149,7 @@ def _fetch_page(self, album_id, authorization, hashed_pass, page): 
'Authorization': 'jwt ' + authorization, })['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: return for video in videos: link = video.get('link') @@ -1196,7 +1191,7 @@ def _real_extract(self, url): 'X-Requested-With': 'XMLHttpRequest', })['hashed_pass'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError('Wrong password', expected=True) raise entries = OnDemandPagedList(functools.partial( @@ -1309,10 +1304,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): # XXX: Do not subclass from concrete I def _page_url(self, base_url, pagenum): url = '%s/page:%d/' % (base_url, pagenum) - request = sanitized_Request(url) + request = Request(url) # Set the header to get a partial html page with the ids, # the normal page doesn't contain them. - request.add_header('X-Requested-With', 'XMLHttpRequest') + request.headers['X-Requested-With'] = 'XMLHttpRequest' return request def _real_extract(self, url): @@ -1432,7 +1427,7 @@ def _real_extract(self, url): **self._hidden_inputs(password_form), }), note='Logging in with video password') except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 418: + if isinstance(e.cause, HTTPError) and e.cause.status == 418: raise ExtractorError('Wrong video password', expected=True) raise diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 5753690283..6b7379d46c 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -36,7 +36,7 @@ class VKBaseIE(InfoExtractor): def _download_webpage_handle(self, url_or_request, video_id, *args, fatal=True, **kwargs): response = super()._download_webpage_handle(url_or_request, video_id, *args, fatal=fatal, **kwargs) - challenge_url, cookie = response[1].geturl() if response else '', None + challenge_url, cookie = response[1].url if response else '', None if challenge_url.startswith('https://vk.com/429.html?'): cookie = self._get_cookies(challenge_url).get('hash429') if not cookie: diff --git a/yt_dlp/extractor/vocaroo.py b/yt_dlp/extractor/vocaroo.py index 704e25c227..d98fbfd2d9 100644 --- a/yt_dlp/extractor/vocaroo.py +++ b/yt_dlp/extractor/vocaroo.py @@ -1,8 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - HEADRequest, - float_or_none, -) +from ..networking import HEADRequest +from ..utils import float_or_none class VocarooIE(InfoExtractor): diff --git a/yt_dlp/extractor/vodlocker.py b/yt_dlp/extractor/vodlocker.py index 1c7236ed31..b215d6c9d6 100644 --- a/yt_dlp/extractor/vodlocker.py +++ b/yt_dlp/extractor/vodlocker.py @@ -1,10 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - NO_DEFAULT, - sanitized_Request, - urlencode_postdata, -) +from ..networking import Request +from ..utils import NO_DEFAULT, ExtractorError, urlencode_postdata class VodlockerIE(InfoExtractor): @@ -37,8 +33,8 @@ def _real_extract(self, url): if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! 
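The vodlocker hunk that continues below, like the tubitv and vimeo hunks above, swaps sanitized_Request plus add_header() for the new Request class and its mutable headers mapping. A minimal sketch of that pattern, assuming the yt_dlp.networking API as it appears in this patch (the URL and form fields are illustrative only, not taken from the diff):

    from yt_dlp.networking import Request
    from yt_dlp.utils import urlencode_postdata

    # Build a POST body the same way the extractors do
    payload = urlencode_postdata({'username': 'user', 'password': 'hunter2'})  # hypothetical form data
    req = Request('https://example.com/login', payload)  # replaces sanitized_Request(url, payload)
    # Headers are now a plain mapping; item assignment replaces req.add_header(...)
    req.headers['Content-Type'] = 'application/x-www-form-urlencoded'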
post = urlencode_postdata(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') + req = Request(url, post) + req.headers['Content-type'] = 'application/x-www-form-urlencoded' webpage = self._download_webpage( req, video_id, 'Downloading video page') diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py index dd41647aa9..b19a279344 100644 --- a/yt_dlp/extractor/voot.py +++ b/yt_dlp/extractor/voot.py @@ -1,10 +1,10 @@ import json import time -import urllib.error import uuid from .common import InfoExtractor from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -140,7 +140,7 @@ def _real_extract(self, url): 'voottoken': self._TOKEN, })['m3u8'] except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self._check_token_expiry() raise diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 0058357122..497233d95f 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,10 +1,10 @@ import functools import json import time -import urllib.error import urllib.parse from .gigya import GigyaBaseIE +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, clean_html, @@ -263,7 +263,7 @@ def _perform_login(self, username, password): '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, })) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: retry.error = e continue raise diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index ad9dc568a6..523c442e65 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -8,7 +8,8 @@ import urllib.parse from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_urllib_parse_urlencode +from ..compat import compat_urllib_parse_urlencode +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -54,8 +55,8 @@ def _call_api(self, path, video_id, note, data=None): '?'.join([base_url, encoded_query]), video_id, note='Downloading %s JSON metadata' % note, headers=headers, data=data) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError(json.loads(e.cause.response.read().decode())['message'], expected=True) raise def _call_cms(self, path, video_id, note): diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 81a23b9df3..bc9a71abe0 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -31,7 +31,7 @@ def _real_extract(self, url): # to get Referer url for genvisitor webpage, urlh = self._download_webpage_handle(url, video_id) - visitor_url = urlh.geturl() + visitor_url = urlh.url if 'passport.weibo.com' in visitor_url: # first visit diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index 8f2a7ee06b..9a08b8e43b 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -5,13 +5,13 @@ import json import re import time -import urllib.error import urllib.parse import uuid from .common import InfoExtractor from .naver import NaverBaseIE from .youtube import YoutubeIE 
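The weverse hunk that continues below repeats the error-handling rewrite applied throughout this patch: the cause of an ExtractorError is now yt_dlp.networking.exceptions.HTTPError, the status code moves from .code to .status, and the error body is read from the .response attribute rather than from the exception itself. A minimal standalone sketch of the migrated pattern (the call_api helper and its arguments are hypothetical, not part of the patch):

    from yt_dlp.networking.exceptions import HTTPError
    from yt_dlp.utils import ExtractorError

    def call_api(ie, url, video_id):
        try:
            return ie._download_json(url, video_id)
        except ExtractorError as e:
            # New API: .status replaces .code; the body lives on e.cause.response
            if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
                raise ExtractorError(e.cause.response.read().decode(), expected=True)
            raise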
+from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, UserNotLive, @@ -59,7 +59,7 @@ def _perform_login(self, username, password): 'password': password, }, separators=(',', ':')).encode(), headers=headers, note='Logging in') except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError('Invalid password provided', expected=True) raise @@ -97,10 +97,10 @@ def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): 'wmd': wmd, }) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_login_required( 'Session token has expired. Log in again or refresh cookies in browser') - elif isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + elif isinstance(e.cause, HTTPError) and e.cause.status == 403: raise ExtractorError('Your account does not have access to this content', expected=True) raise diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 884fa4b5fd..bce5e8326b 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -1,12 +1,12 @@ import re -import urllib.error import urllib.parse from base64 import b64decode from .common import InfoExtractor +from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, - HEADRequest, determine_ext, float_or_none, int_or_none, @@ -365,7 +365,7 @@ def _real_extract(self, url): try: data = self._download_embed_config('channel', channel_id, url) - except (ExtractorError, urllib.error.HTTPError): + except (ExtractorError, HTTPError): # Some channels give a 403 from the JSON API self.report_warning('Failed to download channel data from API, falling back to webpage.') webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) diff --git a/yt_dlp/extractor/wykop.py b/yt_dlp/extractor/wykop.py index 0fa6d524db..1d29cc89b4 100644 --- a/yt_dlp/extractor/wykop.py +++ b/yt_dlp/extractor/wykop.py @@ -1,7 +1,7 @@ import json -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, format_field, @@ -43,7 +43,7 @@ def _call_api(self, path, video_id, note='Downloading JSON metadata'): try: return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'}) except ExtractorError as e: - if not retrying and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403: token = self._get_token(True) continue raise diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index 7af6c8f037..37224799bf 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -183,7 +183,7 @@ def get_height(s): 'height': get_height(quality), 'filesize': format_sizes.get(quality), 'http_headers': { - 'Referer': urlh.geturl(), + 'Referer': urlh.url, }, }) xplayer_sources = try_get( diff --git a/yt_dlp/extractor/xtube.py b/yt_dlp/extractor/xtube.py index ce4480c7d8..db82925896 100644 --- a/yt_dlp/extractor/xtube.py +++ b/yt_dlp/extractor/xtube.py @@ -2,12 +2,12 @@ import re from .common import InfoExtractor +from ..networking import Request from ..utils import ( int_or_none, js_to_json, orderedSet, parse_duration, - 
sanitized_Request, str_to_int, url_or_none, ) @@ -186,7 +186,7 @@ def _real_extract(self, url): entries = [] for pagenum in itertools.count(1): - request = sanitized_Request( + request = Request( 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum), headers={ 'Cookie': 'popunder=4', diff --git a/yt_dlp/extractor/yesjapan.py index b45fa8f144..94e41660de 100644 --- a/yt_dlp/extractor/yesjapan.py +++ b/yt_dlp/extractor/yesjapan.py @@ -1,9 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - HEADRequest, - get_element_by_attribute, - parse_iso8601, -) +from ..networking import HEADRequest +from ..utils import get_element_by_attribute, parse_iso8601 class YesJapanIE(InfoExtractor): @@ -42,7 +39,7 @@ def _real_extract(self, url): req = self._request_webpage( redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False) if req: - video_url = req.geturl() + video_url = req.url formats = [{ 'format_id': 'sd', diff --git a/yt_dlp/extractor/youtube.py index 826bbb20e1..2b3776aa1d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -15,13 +15,13 @@ import threading import time import traceback -import urllib.error import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..compat import functools from ..jsinterp import JSInterpreter +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -41,7 +41,6 @@ join_nonempty, js_to_json, mimetype2ext, - network_exceptions, orderedSet, parse_codecs, parse_count, @@ -959,15 +958,15 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers except ExtractorError as e: if not isinstance(e.cause, network_exceptions): return self._error_or_warning(e, fatal=fatal) - elif not isinstance(e.cause, urllib.error.HTTPError): + elif not isinstance(e.cause, HTTPError): retry.error = e continue - first_bytes = e.cause.read(512) + first_bytes = e.cause.response.read(512) if not is_html(first_bytes): yt_error = try_get( self._parse_json( - self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), lambda x: x['error']['message'], str) if yt_error: self._report_alerts([('ERROR', yt_error)], fatal=False) @@ -975,7 +974,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers # Sometimes a 404 is also received.
See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 - if e.cause.code not in (403, 429): + if e.cause.status not in (403, 429): retry.error = e continue return self._error_or_warning(e, fatal=fatal) @@ -2837,7 +2836,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -5263,7 +5262,7 @@ def _extract_webpage(self, url, item_id, fatal=True): data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429): retry.error = e continue self._error_or_warning(e, fatal=fatal) diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index 84cee4445e..0ccacbb6aa 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -16,7 +16,7 @@ class ZaikoBaseIE(InfoExtractor): def _download_real_webpage(self, url, video_id): webpage, urlh = self._download_webpage_handle(url, video_id) - final_url = urlh.geturl() + final_url = urlh.url if 'zaiko.io/login' in final_url: self.raise_login_required() elif '/_buy/' in final_url: diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 22620c0a32..6bd9ea064e 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -2,7 +2,8 @@ from uuid import uuid4 from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -36,7 +37,7 @@ def _perform_login(self, username, password): 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError( 'Unable to login: incorrect username and/or password', expected=True) diff --git a/yt_dlp/extractor/zype.py b/yt_dlp/extractor/zype.py index 8cf994505a..2f3b4c47f5 100644 --- a/yt_dlp/extractor/zype.py +++ b/yt_dlp/extractor/zype.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( dict_get, ExtractorError, @@ -37,9 +37,9 @@ def _real_extract(self, url): response = self._download_json(re.sub( r'\.(?:js|html)\?', '.json?', url), video_id)['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 403): raise ExtractorError(self._parse_json( - e.cause.read().decode(), video_id)['message'], expected=True) + e.cause.response.read().decode(), video_id)['message'], expected=True) raise body = 
response['body'] diff --git a/yt_dlp/networking/common.py index e4b3628276..458eca39f8 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -24,6 +24,7 @@ from ..utils import ( bug_reports_message, classproperty, + deprecation_warning, error_to_str, escape_url, update_url_query, @@ -507,16 +508,21 @@ def get_header(self, name, default=None): # The following methods are for compatibility reasons and are deprecated @property def code(self): + deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2) return self.status def getcode(self): + deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2) return self.status def geturl(self): + deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2) return self.url def info(self): + deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2) return self.headers def getheader(self, name, default=None): + deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2) return self.get_header(name, default) diff --git a/yt_dlp/networking/exceptions.py index 6fe8afb925..10afc9ccbf 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -3,7 +3,7 @@ import typing import urllib.error -from ..utils import YoutubeDLError +from ..utils import YoutubeDLError, deprecation_warning if typing.TYPE_CHECKING: from .common import RequestHandler, Response @@ -137,6 +137,7 @@ def reason(self, value): @property def headers(self): + deprecation_warning('HTTPError.headers is deprecated, use HTTPError.response.headers instead') return self._http_error.response.headers @headers.setter @@ -144,16 +145,20 @@ def headers(self, value): return def info(self): + deprecation_warning('HTTPError.info() is deprecated, use HTTPError.response.headers instead') return self.response.headers def getcode(self): + deprecation_warning('HTTPError.getcode is deprecated, use HTTPError.status instead') return self.status def geturl(self): + deprecation_warning('HTTPError.geturl is deprecated, use HTTPError.response.url instead') return self.response.url @property def code(self): + deprecation_warning('HTTPError.code is deprecated, use HTTPError.status instead') return self.status @code.setter @@ -162,6 +167,7 @@ def code(self, value): @property def url(self): + deprecation_warning('HTTPError.url is deprecated, use HTTPError.response.url instead') return self.response.url @url.setter @@ -170,6 +176,7 @@ def url(self, value): @property def hdrs(self): + deprecation_warning('HTTPError.hdrs is deprecated, use HTTPError.response.headers instead') return self.response.headers @hdrs.setter @@ -178,6 +185,7 @@ def hdrs(self, value): @property def filename(self): + deprecation_warning('HTTPError.filename is deprecated, use HTTPError.response.url instead') return self.response.url @filename.setter @@ -185,6 +193,18 @@ def filename(self, value): return def __getattr__(self, name): + # File operations are passed through the response.
+ # Warn for some commonly used ones + passthrough_warnings = { + 'read': 'response.read()', + # technically possible due to passthrough, but we should discourage this + 'get_header': 'response.get_header()', + 'readable': 'response.readable()', + 'closed': 'response.closed', + 'tell': 'response.tell()', + } + if name in passthrough_warnings: + deprecation_warning(f'HTTPError.{name} is deprecated, use HTTPError.{passthrough_warnings[name]} instead') return super().__getattr__(name) def __str__(self): diff --git a/yt_dlp/postprocessor/common.py index 08b0fe1ff9..8cef86c43a 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -1,16 +1,15 @@ import functools import json import os -import urllib.error +from ..networking import Request +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( PostProcessingError, RetryManager, _configuration_args, deprecation_warning, encodeFilename, - network_exceptions, - sanitized_Request, ) @@ -203,13 +202,13 @@ def _download_json(self, url, *, expected_http_errors=(404,)): self.write_debug(f'{self.PP_NAME} query: {url}') for retry in RetryManager(self.get_param('extractor_retries', 3), self._retry_download): try: - rsp = self._downloader.urlopen(sanitized_Request(url)) + rsp = self._downloader.urlopen(Request(url)) except network_exceptions as e: - if isinstance(e, urllib.error.HTTPError) and e.code in expected_http_errors: + if isinstance(e, HTTPError) and e.status in expected_http_errors: return None retry.error = PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}') continue - return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) + return json.loads(rsp.read().decode(rsp.headers.get_param('charset') or 'utf-8')) class AudioConversionError(PostProcessingError): # Deprecated diff --git a/yt_dlp/update.py index 4790075eb6..d708b09e35 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -7,19 +7,18 @@ import re import subprocess import sys -import urllib.error from zipimport import zipimporter from .compat import functools # isort: split from .compat import compat_realpath, compat_shlex_quote +from .networking import Request +from .networking.exceptions import HTTPError, network_exceptions from .utils import ( Popen, cached_method, deprecation_warning, - network_exceptions, remove_end, remove_start, - sanitized_Request, shell_quote, system_identifier, version_tuple, @@ -190,7 +189,7 @@ def _tag(self): def _get_version_info(self, tag): url = f'{API_BASE_URL}/{self._target_repo}/releases/{tag}' self.ydl.write_debug(f'Fetching release info: {url}') - return json.loads(self.ydl.urlopen(sanitized_Request(url, headers={ + return json.loads(self.ydl.urlopen(Request(url, headers={ 'Accept': 'application/vnd.github+json', 'User-Agent': 'yt-dlp', 'X-GitHub-Api-Version': '2022-11-28', @@ -315,7 +314,7 @@ def update(self): try: newcontent = self._download(self.release_name, self._tag) except network_exceptions as e: - if isinstance(e, urllib.error.HTTPError) and e.code == 404: + if isinstance(e, HTTPError) and e.status == 404: return self._report_error( f'The requested tag {self._label(self.target_channel, self.target_tag)} does not exist', True) return self._report_network_error(f'fetch updates: {e}') diff --git a/yt_dlp/utils/_deprecated.py index e55d42354a..a8ae8ecb5d 100644 --- a/yt_dlp/utils/_deprecated.py +++ b/yt_dlp/utils/_deprecated.py @@ -10,16 +10,6 @@ from ._utils
import preferredencoding -from ..networking._urllib import HTTPHandler - -# isort: split -from .networking import random_user_agent, std_headers # noqa: F401 -from ..networking._urllib import PUTRequest # noqa: F401 -from ..networking._urllib import SUPPORTED_ENCODINGS, HEADRequest # noqa: F401 -from ..networking._urllib import ProxyHandler as PerRequestProxyHandler # noqa: F401 -from ..networking._urllib import RedirectHandler as YoutubeDLRedirectHandler # noqa: F401 -from ..networking._urllib import make_socks_conn_class, update_Request # noqa: F401 -from ..networking.exceptions import network_exceptions # noqa: F401 def encodeFilename(s, for_subprocess=False): @@ -47,12 +37,3 @@ def decodeOption(optval): def error_to_compat_str(err): return str(err) - - -class YoutubeDLHandler(HTTPHandler): - def __init__(self, params, *args, **kwargs): - self._params = params - super().__init__(*args, **kwargs) - - -YoutubeDLHTTPSHandler = YoutubeDLHandler diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 96ac468b1f..0770009717 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -1,17 +1,30 @@ """No longer used and new code should not use. Exists only for API compat.""" - import platform import struct import sys +import urllib.error import urllib.parse +import urllib.request import zlib from ._utils import Popen, decode_base_n, preferredencoding from .traversal import traverse_obj from ..dependencies import certifi, websockets +from ..networking._helper import make_ssl_context +from ..networking._urllib import HTTPHandler # isort: split +from .networking import random_user_agent, std_headers # noqa: F401 from ..cookies import YoutubeDLCookieJar # noqa: F401 +from ..networking._urllib import PUTRequest # noqa: F401 +from ..networking._urllib import SUPPORTED_ENCODINGS, HEADRequest # noqa: F401 +from ..networking._urllib import ProxyHandler as PerRequestProxyHandler # noqa: F401 +from ..networking._urllib import RedirectHandler as YoutubeDLRedirectHandler # noqa: F401 +from ..networking._urllib import ( # noqa: F401 + make_socks_conn_class, + update_Request, +) +from ..networking.exceptions import HTTPError, network_exceptions # noqa: F401 has_certifi = bool(certifi) has_websockets = bool(websockets) @@ -176,5 +189,52 @@ def handle_youtubedl_headers(headers): return filtered_headers +def request_to_url(req): + if isinstance(req, urllib.request.Request): + return req.get_full_url() + else: + return req + + +def sanitized_Request(url, *args, **kwargs): + from ..utils import escape_url, extract_basic_auth, sanitize_url + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) + headers['Authorization'] = auth_header + return urllib.request.Request(url, *args, **kwargs) + + +class YoutubeDLHandler(HTTPHandler): + def __init__(self, params, *args, **kwargs): + self._params = params + super().__init__(*args, **kwargs) + + +YoutubeDLHTTPSHandler = YoutubeDLHandler + + +class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + return urllib.request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = urllib.request.HTTPCookieProcessor.http_request + https_response = http_response + + +def make_HTTPS_handler(params, **kwargs): + return YoutubeDLHTTPSHandler(params, 
context=make_ssl_context( + verify=not params.get('nocheckcertificate'), + client_certificate=params.get('client_certificate'), + client_certificate_key=params.get('client_certificate_key'), + client_certificate_password=params.get('client_certificate_password'), + legacy_support=params.get('legacyserverconnect'), + use_certifi='no-certifi' not in params.get('compat_opts', []), + ), **kwargs) + + def process_communicate_or_kill(p, *args, **kwargs): return Popen.communicate_or_kill(p, *args, **kwargs) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d0e3287166..2e619f9ea4 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -62,11 +62,6 @@ compiled_regex_type = type(re.compile('')) -USER_AGENTS = { - 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', -} - - class NO_DEFAULT: pass @@ -727,14 +722,6 @@ def extract_basic_auth(url): return url, f'Basic {auth_payload.decode()}' -def sanitized_Request(url, *args, **kwargs): - url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) - if auth_header is not None: - headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) - headers['Authorization'] = auth_header - return urllib.request.Request(url, *args, **kwargs) - - def expand_path(s): """Expand shell variables and ~""" return os.path.expandvars(compat_expanduser(s)) @@ -894,19 +881,6 @@ def formatSeconds(secs, delim=':', msec=False): return '%s.%03d' % (ret, time.milliseconds) if msec else ret -def make_HTTPS_handler(params, **kwargs): - from ._deprecated import YoutubeDLHTTPSHandler - from ..networking._helper import make_ssl_context - return YoutubeDLHTTPSHandler(params, context=make_ssl_context( - verify=not params.get('nocheckcertificate'), - client_certificate=params.get('client_certificate'), - client_certificate_key=params.get('client_certificate_key'), - client_certificate_password=params.get('client_certificate_password'), - legacy_support=params.get('legacyserverconnect'), - use_certifi='no-certifi' not in params.get('compat_opts', []), - ), **kwargs) - - def bug_reports_message(before=';'): from ..update import REPOSITORY @@ -1143,17 +1117,6 @@ def is_path_like(f): return isinstance(f, (str, bytes, os.PathLike)) -class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): - def __init__(self, cookiejar=None): - urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) - - def http_response(self, request, response): - return urllib.request.HTTPCookieProcessor.http_response(self, request, response) - - https_request = urllib.request.HTTPCookieProcessor.http_request - https_response = http_response - - def extract_timezone(date_str): m = re.search( r'''(?x) @@ -1455,6 +1418,7 @@ def write_string(s, out=None, encoding=None): out.flush() +# TODO: Use global logger def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): from .. 
import _IN_CLI if _IN_CLI: @@ -2005,13 +1969,6 @@ def url_or_none(url): return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None -def request_to_url(req): - if isinstance(req, urllib.request.Request): - return req.get_full_url() - else: - return req - - def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): datetime_object = None try: @@ -5525,7 +5482,7 @@ def info(self, message): def warning(self, message, *, once=False): if self._ydl: - self._ydl.report_warning(message, only_once=once) + self._ydl.report_warning(message, once) def error(self, message, *, is_error=True): if self._ydl: From 131d132da5c98c6c78bd7eed4b37f4458561b3d9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 15 Jul 2023 16:39:45 +0530 Subject: [PATCH 277/501] [build] Make sure deprecated modules are added --- setup.py | 3 ++- yt_dlp/__pyinstaller/hook-yt_dlp.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ccfcf42520..a2f9f55c36 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,8 @@ def py2exe_params(): 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here - 'includes': ['yt_dlp.compat._legacy'], + 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', + 'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'], }, 'zipfile': None, } diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py index 63dcdffe02..88c2b8b285 100644 --- a/yt_dlp/__pyinstaller/hook-yt_dlp.py +++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -18,7 +18,8 @@ def pycryptodome_module(): def get_hidden_imports(): - yield 'yt_dlp.compat._legacy' + yield from ('yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated') + yield from ('yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated') yield pycryptodome_module() yield from collect_submodules('websockets') # These are auto-detected, but explicitly add them just in case From 2b029ca0a9f9105c4f7626993fa60e54c9782749 Mon Sep 17 00:00:00 2001 From: Aaruni Kaushik <aaruni96@users.noreply.github.com> Date: Sat, 15 Jul 2023 21:15:08 +0200 Subject: [PATCH 278/501] [cleanup] Add color to `download-archive` message (#5138) Authored by: aaruni96, Grub4K, pukkandan Closes #4913 --- yt_dlp/YoutubeDL.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 850eb8ae0a..c49960782d 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1492,7 +1492,10 @@ def check_filter(): return ret if self.in_download_archive(info_dict): - reason = '%s has already been recorded in the archive' % video_title + reason = ''.join(( + format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '), + format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '), + 'has already been recorded in the archive')) break_opt, break_err = 'break_on_existing', ExistingVideoReached else: try: @@ -1553,7 +1556,8 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None, temp_id = ie.get_temp_id(url) if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): - self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive') + self.to_screen(f'[download] {self._format_screen(temp_id, self.Styles.ID)}: ' + 'has already been recorded in the archive') if self.params.get('break_on_existing', 
False): raise ExistingVideoReached() break From 6c5211cebeacfc53ad5d5ddf4a659be76039656f Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 15 Jul 2023 15:22:10 -0500 Subject: [PATCH 279/501] [core] Fix HTTP headers and cookie handling - Remove `Cookie` header from `http_headers` immediately after loading into cookiejar - Restore compat for `--load-info-json` cookies - Add more tests - Fix improper passing of Cookie header by `MailRu` extractor Closes #7558 Authored by: bashonly, pukkandan --- test/test_YoutubeDL.py | 85 +++++++++++++++++++++++++++++---- test/test_YoutubeDLCookieJar.py | 8 ++++ yt_dlp/YoutubeDL.py | 46 ++++++++++++------ yt_dlp/downloader/common.py | 6 --- yt_dlp/extractor/mailru.py | 8 ++-- 5 files changed, 120 insertions(+), 33 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index c15c7704c5..b4f770ca58 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -11,7 +11,7 @@ import copy import json -from test.helper import FakeYDL, assertRegexpMatches +from test.helper import FakeYDL, assertRegexpMatches, try_rm from yt_dlp import YoutubeDL from yt_dlp.compat import compat_os_name from yt_dlp.extractor import YoutubeIE @@ -24,6 +24,8 @@ int_or_none, match_filter_func, ) +from yt_dlp.utils.traversal import traverse_obj + TEST_URL = 'http://localhost/sample.mp4' @@ -1227,10 +1229,10 @@ def cookie(name, value, version=None, domain='', path='', secure=False, expires= _test_url = 'https://yt.dlp/test' - def test(encoded_cookies, cookies, headers=False, round_trip=None, error=None): + def test(encoded_cookies, cookies, *, headers=False, round_trip=None, error_re=None): def _test(): ydl.cookiejar.clear() - ydl._load_cookies(encoded_cookies, from_headers=headers) + ydl._load_cookies(encoded_cookies, autoscope=headers) if headers: ydl._apply_header_cookies(_test_url) data = {'url': _test_url} @@ -1245,14 +1247,14 @@ def _test(): ydl.__dict__['_YoutubeDL__header_cookies'] = [] with self.subTest(msg=encoded_cookies): - if not error: + if not error_re: _test() return - with self.assertRaisesRegex(Exception, error): + with self.assertRaisesRegex(Exception, error_re): _test() test('test=value; Domain=.yt.dlp', [cookie('test', 'value', domain='.yt.dlp')]) - test('test=value', [cookie('test', 'value')], error='Unscoped cookies are not allowed') + test('test=value', [cookie('test', 'value')], error_re=r'Unscoped cookies are not allowed') test('cookie1=value1; Domain=.yt.dlp; Path=/test; cookie2=value2; Domain=.yt.dlp; Path=/', [ cookie('cookie1', 'value1', domain='.yt.dlp', path='/test'), cookie('cookie2', 'value2', domain='.yt.dlp', path='/')]) @@ -1265,9 +1267,76 @@ def _test(): round_trip='name=""; Domain=.yt.dlp') test('test=value', [cookie('test', 'value', domain='.yt.dlp')], headers=True) - test('cookie1=value; Domain=.yt.dlp; cookie2=value', [], headers=True, error='Invalid syntax') + test('cookie1=value; Domain=.yt.dlp; cookie2=value', [], headers=True, error_re=r'Invalid syntax') ydl.deprecated_feature = ydl.report_error - test('test=value', [], headers=True, error='Passing cookies as a header is a potential security risk') + test('test=value', [], headers=True, error_re=r'Passing cookies as a header is a potential security risk') + + def test_infojson_cookies(self): + TEST_FILE = 'test_infojson_cookies.info.json' + TEST_URL = 'https://example.com/example.mp4' + COOKIES = 'a=b; Domain=.example.com; c=d; Domain=.example.com' + COOKIE_HEADER = {'Cookie': 'a=b; c=d'} + + ydl = FakeYDL() + ydl.process_info = lambda x: 
ydl._write_info_json('test', x, TEST_FILE) + + def make_info(info_header_cookies=False, fmts_header_cookies=False, cookies_field=False): + fmt = {'url': TEST_URL} + if fmts_header_cookies: + fmt['http_headers'] = COOKIE_HEADER + if cookies_field: + fmt['cookies'] = COOKIES + return _make_result([fmt], http_headers=COOKIE_HEADER if info_header_cookies else None) + + def test(initial_info, note): + result = {} + result['processed'] = ydl.process_ie_result(initial_info) + self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL), + msg=f'No cookies set in cookiejar after initial process when {note}') + ydl.cookiejar.clear() + with open(TEST_FILE) as infojson: + result['loaded'] = ydl.sanitize_info(json.load(infojson), True) + result['final'] = ydl.process_ie_result(result['loaded'].copy(), download=False) + self.assertTrue(ydl.cookiejar.get_cookies_for_url(TEST_URL), + msg=f'No cookies set in cookiejar after final process when {note}') + ydl.cookiejar.clear() + for key in ('processed', 'loaded', 'final'): + info = result[key] + self.assertIsNone( + traverse_obj(info, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False), + msg=f'Cookie header not removed in {key} result when {note}') + self.assertEqual( + traverse_obj(info, ((None, ('formats', 0)), 'cookies'), get_all=False), COOKIES, + msg=f'No cookies field found in {key} result when {note}') + + test({'url': TEST_URL, 'http_headers': COOKIE_HEADER, 'id': '1', 'title': 'x'}, 'no formats field') + test(make_info(info_header_cookies=True), 'info_dict header cookies') + test(make_info(fmts_header_cookies=True), 'format header cookies') + test(make_info(info_header_cookies=True, fmts_header_cookies=True), 'info_dict and format header cookies') + test(make_info(info_header_cookies=True, fmts_header_cookies=True, cookies_field=True), 'all cookies fields') + test(make_info(cookies_field=True), 'cookies format field') + test({'url': TEST_URL, 'cookies': COOKIES, 'id': '1', 'title': 'x'}, 'info_dict cookies field only') + + try_rm(TEST_FILE) + + def test_add_headers_cookie(self): + def check_for_cookie_header(result): + return traverse_obj(result, ((None, ('formats', 0)), 'http_headers', 'Cookie'), casesense=False, get_all=False) + + ydl = FakeYDL({'http_headers': {'Cookie': 'a=b'}}) + ydl._apply_header_cookies(_make_result([])['webpage_url']) # Scope to input webpage URL: .example.com + + fmt = {'url': 'https://example.com/video.mp4'} + result = ydl.process_ie_result(_make_result([fmt]), download=False) + self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies in result info_dict') + self.assertEqual(result.get('cookies'), 'a=b; Domain=.example.com', msg='No cookies were set in cookies field') + self.assertIn('a=b', ydl.cookiejar.get_cookie_header(fmt['url']), msg='No cookies were set in cookiejar') + + fmt = {'url': 'https://wrong.com/video.mp4'} + result = ydl.process_ie_result(_make_result([fmt]), download=False) + self.assertIsNone(check_for_cookie_header(result), msg='http_headers cookies for wrong domain') + self.assertFalse(result.get('cookies'), msg='Cookies set in cookies field for wrong domain') + self.assertFalse(ydl.cookiejar.get_cookie_header(fmt['url']), msg='Cookies set in cookiejar for wrong domain') if __name__ == '__main__': diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 2c73d7d853..0b7a0acdb5 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -53,6 +53,14 @@ def test_get_cookie_header(self): header =
cookiejar.get_cookie_header('https://www.foobar.foobar') self.assertIn('HTTPONLY_COOKIE', header) + def test_get_cookies_for_url(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + cookies = cookiejar.get_cookies_for_url('https://www.foobar.foobar/') + self.assertEqual(len(cookies), 2) + cookies = cookiejar.get_cookies_for_url('https://foobar.foobar/') + self.assertFalse(cookies) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index c49960782d..1a2f42fe9a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -680,14 +680,15 @@ def process_color_policy(stream): self.params['compat_opts'] = set(self.params.get('compat_opts', ())) self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) + self.__header_cookies = [] + self._load_cookies(self.params['http_headers'].get('Cookie')) # compat + self.params['http_headers'].pop('Cookie', None) + self._request_director = self.build_request_director( sorted(_REQUEST_HANDLERS.values(), key=lambda rh: rh.RH_NAME.lower())) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() - self.__header_cookies = [] - self._load_cookies(traverse_obj(self.params.get('http_headers'), 'cookie', casesense=False)) # compat - def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: self.report_warning(f'{option} is deprecated. Use {suggestion} instead') @@ -1645,18 +1646,19 @@ def progress(msg): self.to_screen('') raise - def _load_cookies(self, data, *, from_headers=True): + def _load_cookies(self, data, *, autoscope=True): """Loads cookies from a `Cookie` header This tries to work around the security vulnerability of passing cookies to every domain. See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj - The unscoped cookies are saved for later to be stored in the jar with a limited scope. @param data The Cookie header as string to load the cookies from - @param from_headers If `False`, allows Set-Cookie syntax in the cookie string (at least a domain will be required) + @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains + If `True`, save cookies for later to be stored in the jar with a limited scope + If a URL, save cookies in the jar with the domain of the URL """ for cookie in LenientSimpleCookie(data).values(): - if from_headers and any(cookie.values()): + if autoscope and any(cookie.values()): raise ValueError('Invalid syntax in Cookie Header') domain = cookie.get('domain') or '' @@ -1670,17 +1672,23 @@ def _load_cookies(self, data, *, from_headers=True): if domain: self.cookiejar.set_cookie(prepared_cookie) - elif from_headers: + elif autoscope is True: self.deprecated_feature( 'Passing cookies as a header is a potential security risk; ' 'they will be scoped to the domain of the downloaded urls. ' 'Please consider loading cookies from a file or browser instead.') self.__header_cookies.append(prepared_cookie) + elif autoscope: + self.report_warning( + 'The extractor result contains an unscoped cookie as an HTTP header. 
' + f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}', + only_once=True) + self._apply_header_cookies(autoscope, [prepared_cookie]) else: self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping', tb=False, is_error=False) - def _apply_header_cookies(self, url): + def _apply_header_cookies(self, url, cookies=None): """Applies stray header cookies to the provided url This loads header cookies and scopes them to the domain provided in `url`. @@ -1691,7 +1699,7 @@ def _apply_header_cookies(self, url): if not parsed.hostname: return - for cookie in map(copy.copy, self.__header_cookies): + for cookie in map(copy.copy, cookies or self.__header_cookies): cookie.domain = f'.{parsed.hostname}' self.cookiejar.set_cookie(cookie) @@ -2481,9 +2489,16 @@ def restore_last_token(self): parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) - def _calc_headers(self, info_dict): + def _calc_headers(self, info_dict, load_cookies=False): res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers')) clean_headers(res) + + if load_cookies: # For --load-info-json + self._load_cookies(res.get('Cookie'), autoscope=info_dict['url']) # compat + self._load_cookies(info_dict.get('cookies'), autoscope=False) + # The `Cookie` header is removed to prevent leaks and unscoped cookies. + # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + res.pop('Cookie', None) cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: encoder = LenientSimpleCookie() @@ -2762,7 +2777,12 @@ def is_wellformed(f): and info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) - format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict)) + format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True) + + # Safeguard against old/insecure infojson when using --load-info-json + if info_dict.get('http_headers'): + info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers']) + info_dict['http_headers'].pop('Cookie', None) # This is copied to http_headers by the above _calc_headers and can now be removed if '__x_forwarded_for_ip' in info_dict: @@ -3508,8 +3528,6 @@ def download_with_info_file(self, info_filename): infos = [self.sanitize_info(info, self.params.get('clean_infojson', True)) for info in variadic(json.loads('\n'.join(f)))] for info in infos: - self._load_cookies(info.get('cookies'), from_headers=False) - self._load_cookies(traverse_obj(info.get('http_headers'), 'Cookie', casesense=False)) # compat try: self.__download_wrapper(self.process_ie_result)(info, download=True) except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 2c404ee902..b71d7ee8f2 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -32,7 +32,6 @@ timetuple_from_msec, try_call, ) -from ..utils.traversal import traverse_obj class FileDownloader: @@ -453,11 +452,6 @@ def download(self, filename, info_dict, subtitle=False): self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...') time.sleep(sleep_interval) - # Filter the `Cookie` header from the info_dict to prevent leaks. 
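
The filtering being removed from the downloader here is not lost: the same GHSA-v8mc-9377-rwjj protection now happens once in `_calc_headers` above, via `res.pop('Cookie', None)` on the case-insensitive `HTTPHeaderDict`. With a plain `dict`, the equivalent would look roughly like this (hypothetical helper, shown only to make the case-insensitivity explicit):

    def strip_cookie_header(headers: dict) -> dict:
        # Drop any 'Cookie' key regardless of capitalization, keep everything else
        return {k: v for k, v in headers.items() if k.lower() != 'cookie'}

    assert strip_cookie_header({'COOKIE': 'a=b', 'User-Agent': 'yt-dlp'}) == {'User-Agent': 'yt-dlp'}
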
- # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj - info_dict['http_headers'] = dict(traverse_obj(info_dict, ( - 'http_headers', {dict.items}, lambda _, pair: pair[0].lower() != 'cookie'))) or None - ret = self.real_download(filename, info_dict) self._finish_multiline_status() return ret, True diff --git a/yt_dlp/extractor/mailru.py b/yt_dlp/extractor/mailru.py index 387d211fe1..0f0550c921 100644 --- a/yt_dlp/extractor/mailru.py +++ b/yt_dlp/extractor/mailru.py @@ -1,6 +1,7 @@ import itertools import json import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote @@ -140,17 +141,15 @@ def _real_extract(self, url): 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') - headers = {} - video_key = self._get_cookies('https://my.mail.ru').get('video_key') - if video_key: - headers['Cookie'] = 'video_key=%s' % video_key.value formats = [] for f in video_data['videos']: video_url = f.get('url') if not video_url: continue + if video_key: + self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value) format_id = f.get('key') height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None @@ -158,7 +157,6 @@ def _real_extract(self, url): 'url': video_url, 'format_id': format_id, 'height': height, - 'http_headers': headers, }) meta_data = video_data['meta'] From 42ded0a429c20ec13dc006825e1508d9a02f0ad4 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 15 Jul 2023 15:18:25 -0500 Subject: [PATCH 280/501] [fd/external] Fixes to cookie handling - Fix bug in `axel` Cookie header arg - Pass cookies to `curl` as strings - Write session cookies for `aria2c` and `wget` Closes #7539 Authored by: bashonly --- test/test_downloader_external.py | 9 +++++---- yt_dlp/downloader/external.py | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py index e5b02ba5a4..d3d74df043 100644 --- a/test/test_downloader_external.py +++ b/test/test_downloader_external.py @@ -68,7 +68,7 @@ def test_make_cmd(self): ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE)) self.assertEqual( downloader._make_cmd('test', TEST_INFO), - ['axel', '-o', 'test', 'Cookie: test=ytdlp', '--max-redirect=0', '--', 'http://www.example.com/']) + ['axel', '-o', 'test', '-H', 'Cookie: test=ytdlp', '--max-redirect=0', '--', 'http://www.example.com/']) class TestWgetFD(unittest.TestCase): @@ -85,10 +85,11 @@ class TestCurlFD(unittest.TestCase): def test_make_cmd(self): with FakeYDL() as ydl: downloader = CurlFD(ydl, {}) - self.assertNotIn('--cookie-jar', downloader._make_cmd('test', TEST_INFO)) - # Test cookiejar tempfile arg is added + self.assertNotIn('--cookie', downloader._make_cmd('test', TEST_INFO)) + # Test cookie header is added ydl.cookiejar.set_cookie(http.cookiejar.Cookie(**TEST_COOKIE)) - self.assertIn('--cookie-jar', downloader._make_cmd('test', TEST_INFO)) + self.assertIn('--cookie', downloader._make_cmd('test', TEST_INFO)) + self.assertIn('test=ytdlp', downloader._make_cmd('test', TEST_INFO)) class TestAria2cFD(unittest.TestCase): diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index e307502db1..4f52f6e8df 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -137,7 +137,7 @@ def _write_cookies(self): self._cookies_tempfile = tmp_cookies.name 
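
A detail worth noting before the `save()` change just below: the stdlib cookie jar skips session cookies (those with no expiry, flagged `discard=True`) and expired cookies on save unless explicitly told otherwise, which is why aria2c and wget previously received incomplete cookie files. A small, self-contained illustration of that stdlib behaviour:

    import http.cookiejar

    jar = http.cookiejar.MozillaCookieJar()
    # A session cookie: no expiry, so discard=True
    jar.set_cookie(http.cookiejar.Cookie(
        0, 'session', 'ytdlp', None, False, 'example.com', True, False,
        '/', True, False, None, True, None, None, {}))
    jar.save('with_session.txt', ignore_discard=True, ignore_expires=True)  # written
    jar.save('without_session.txt')  # session cookie silently omitted
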
self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"') # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename - self.ydl.cookiejar.save(self._cookies_tempfile) + self.ydl.cookiejar.save(self._cookies_tempfile, ignore_discard=True, ignore_expires=True) return self.ydl.cookiejar.filename or self._cookies_tempfile def _call_downloader(self, tmpfilename, info_dict): @@ -199,8 +199,9 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] - if self.ydl.cookiejar.get_cookie_header(info_dict['url']): - cmd += ['--cookie-jar', self._write_cookies()] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['--cookie', cookie_header] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', f'{key}: {val}'] @@ -233,7 +234,7 @@ def _make_cmd(self, tmpfilename, info_dict): cmd += ['-H', f'{key}: {val}'] cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) if cookie_header: - cmd += [f'Cookie: {cookie_header}', '--max-redirect=0'] + cmd += ['-H', f'Cookie: {cookie_header}', '--max-redirect=0'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd From 1d3d579c2142f69831b6ae140e1d8e824e07fa0e Mon Sep 17 00:00:00 2001 From: zhong-yiyu <53254770+zhong-yiyu@users.noreply.github.com> Date: Sat, 15 Jul 2023 21:54:19 +0100 Subject: [PATCH 281/501] [ie/pornhub] Update access cookies for UK (#7591) Closes #7590 Authored by: zhong-yiyu --- yt_dlp/extractor/pornhub.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index f08414030b..999d038d47 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -62,6 +62,7 @@ def _real_initialize(self): def _set_age_cookies(self, host): self._set_cookie(host, 'age_verified', '1') self._set_cookie(host, 'accessAgeDisclaimerPH', '1') + self._set_cookie(host, 'accessAgeDisclaimerUK', '1') self._set_cookie(host, 'accessPH', '1') def _login(self, host): From bb5d84c9d2f1e978c3eddfb5ccbe138036682a36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C4=83n=20Anh?= <65241526+demon071@users.noreply.github.com> Date: Sun, 16 Jul 2023 04:03:23 +0700 Subject: [PATCH 282/501] [ie/facebook:reel] Fix extraction (#7564) Closes #7469 Authored by: demon071, bashonly --- yt_dlp/extractor/facebook.py | 83 +++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 9f4d3fb789..574f8e8c95 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -24,6 +24,7 @@ parse_count, parse_qs, qualities, + str_or_none, traverse_obj, try_get, url_or_none, @@ -90,16 +91,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt', - 'description': 'Asif Nawab Butt', + 'title': 'Asif', + 'description': '', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', + 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'duration': 131.03, + 'concurrent_view_count': int, }, - 'expected_warnings': [ - 'title' - ] }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', @@ -151,7 +152,7 @@ class 
FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '3f3798adb2b73423263e59376f1f5eb7', + 'md5': 'ca63897a90c9452efee5f8c40d080e25', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -162,6 +163,9 @@ class FacebookIE(InfoExtractor): 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', 'view_count': int, + 'uploader_id': '100059479812265', + 'concurrent_view_count': int, + 'duration': 44.478, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -170,12 +174,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео', + 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', 'description': 'Довгоочікуване відео', - 'timestamp': 1486648771, + 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': '100000948048708', + 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'concurrent_view_count': int, + 'thumbnail': r're:^https?://.*', + 'view_count': int, + 'duration': 11736.446, }, 'params': { 'skip_download': True, @@ -192,9 +200,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'La Guía Del Varón', 'thumbnail': r're:^https?://.*', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', @@ -208,9 +214,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'Elisabeth Ahtn', 'uploader_id': '100013949973717', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -252,7 +256,11 @@ class FacebookIE(InfoExtractor): 'timestamp': 1527084179, 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', - 'uploader_id': '234218833769558', + 'uploader_id': '100066514874195', + 'duration': 4524.212, + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -262,8 +270,17 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { 'id': '106560053808006', + 'ext': 'mp4', + 'title': 'Josef', + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, + 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'timestamp': 1549275572, + 'duration': 3.413, + 'uploader': 'Josef Novak', + 'description': '', + 'upload_date': '20190204', }, - 'playlist_count': 2, }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', @@ -276,6 +293,7 @@ class FacebookIE(InfoExtractor): 'id': '10157667649866271', }, 'playlist_count': 3, + 'skip': 'Requires logging in', }, { # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', @@ -497,6 +515,13 @@ def extract_relay_prefetched_data(_filter): entries = [] def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + reel_info = traverse_obj( + video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) + if reel_info: + 
video = video['creation_story'] + video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) + video.update(reel_info) formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), @@ -513,15 +538,15 @@ def parse_graphql_video(video): 'url': playable_url, }) extract_dash_manifest(video, formats) - v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, 'formats': formats, 'thumbnail': traverse_obj( video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), + 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), } process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) @@ -782,18 +807,18 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'md5': 'f13dd37f2633595982db5ed8765474d3', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', - 'description': 'md5:24ea7ef062215d295bdde64e778f5474', - 'uploader': 'Beast Camp Training', - 'uploader_id': '1738535909799870', - 'duration': 9.536, - 'thumbnail': r're:^https?://.*', + 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', + 'description': 'md5:22f03309b216ac84720183961441d8db', + 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'uploader_id': '100040874179269', + 'duration': 9.579, + 'timestamp': 1637502609, 'upload_date': '20211121', - 'timestamp': 1637502604, + 'thumbnail': r're:^https?://.*', } }] From 613dbce177d34ffc31053e8e01acf4bb107bcd1e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 15 Jul 2023 16:10:12 -0500 Subject: [PATCH 283/501] [ie/twitter:spaces] Fix format protocol (#7550) Closes #7536 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 38 ++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4015277a86..9d87dbc4be 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1499,6 +1499,38 @@ class TwitterSpacesIE(TwitterBaseIE): 'release_date': '20220807', }, 'params': {'skip_download': 'm3u8'}, + }, { + # post_live/TimedOut but downloadable + 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', + 'info_dict': { + 'id': '1vAxRAVQWONJl', + 'ext': 'm4a', + 'title': 'Framing Up FinOps: Billing Tools', + 'description': 'Twitter Space participated by rupa, Alfonso Hernandez', + 'uploader': 'Google Cloud', + 'uploader_id': 'googlecloud', + 'live_status': 'post_live', + 'timestamp': 1681409554, + 'upload_date': '20230413', + 'release_timestamp': 1681839000, + 'release_date': '20230418', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Needs ffmpeg as downloader, see: https://github.com/yt-dlp/yt-dlp/issues/7536 + 'url': 'https://twitter.com/i/spaces/1eaKbrQbjoRKX', + 'info_dict': { + 'id': '1eaKbrQbjoRKX', + 'ext': 'm4a', + 'title': 'あ', + 'description': 'Twitter Space participated by nobody yet', + 'uploader': 
'息根とめる🔪Twitchで復活', + 'uploader_id': 'tomeru_ikinone', + 'live_status': 'was_live', + 'timestamp': 1685617198, + 'upload_date': '20230601', + }, + 'params': {'skip_download': 'm3u8'}, }] SPACE_STATUS = { @@ -1555,9 +1587,9 @@ def _real_extract(self, url): source = traverse_obj( self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']), ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) - formats = self._extract_m3u8_formats( - source, metadata['media_key'], 'm4a', live=is_live, fatal=False, - headers={'Referer': 'https://twitter.com/'}) if source else [] + formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader + source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, + headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else [] for fmt in formats: fmt.update({'vcodec': 'none', 'acodec': 'aac'}) if not is_live: From 71baa490ebd3655746430f208a9b605d120cd315 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 20 Jul 2023 08:23:30 -0500 Subject: [PATCH 284/501] [networking] Fix POST requests with zero-length payloads (#7648) Bugfix for 227bf1a33be7b89cd7d44ad046844c4ccba104f4 Authored by: bashonly --- test/test_networking.py | 11 +++++++++++ yt_dlp/extractor/ettutv.py | 2 +- yt_dlp/networking/_urllib.py | 2 +- yt_dlp/networking/common.py | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index b60ed283be..3cf587a637 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -1280,6 +1280,17 @@ def test_content_type_header(self): req.data = b'test3' assert req.headers.get('Content-Type') == 'application/x-www-form-urlencoded' + def test_update_req(self): + req = Request('http://example.com') + assert req.data is None + assert req.method == 'GET' + assert 'Content-Type' not in req.headers + # Test that zero-byte payloads will be sent + req.update(data=b'') + assert req.data == b'' + assert req.method == 'POST' + assert req.headers.get('Content-Type') == 'application/x-www-form-urlencoded' + def test_proxies(self): req = Request(url='http://example.com', proxies={'http': 'http://127.0.0.1:8080'}) assert req.proxies == {'http': 'http://127.0.0.1:8080'} diff --git a/yt_dlp/extractor/ettutv.py b/yt_dlp/extractor/ettutv.py index 46d7255438..133b525556 100644 --- a/yt_dlp/extractor/ettutv.py +++ b/yt_dlp/extractor/ettutv.py @@ -41,7 +41,7 @@ def _real_extract(self, url): 'device': 'desktop', }) - stream_response = self._download_json(player_settings['streamAccess'], video_id, data={}) + stream_response = self._download_json(player_settings['streamAccess'], video_id, data=b'') formats, subtitles = self._extract_m3u8_formats_and_subtitles( stream_response['data']['stream'], video_id, 'mp4') diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 2c5f09872a..8a76676d94 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -315,7 +315,7 @@ def get_method(self): def update_Request(req, url=None, data=None, headers=None, query=None): req_headers = req.headers.copy() req_headers.update(headers or {}) - req_data = data or req.data + req_data = data if data is not None else req.data req_url = update_url_query(url or req.get_full_url(), query) req_get_method = req.get_method() if req_get_method == 'HEAD': diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 458eca39f8..61196406dc 100644 --- 
a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -425,7 +425,7 @@ def headers(self, new_headers: Mapping): raise TypeError('headers must be a mapping') def update(self, url=None, data=None, headers=None, query=None): - self.data = data or self.data + self.data = data if data is not None else self.data self.headers.update(headers or {}) self.url = update_url_query(url or self.url, query or {}) From 75dc8e673b481a82d0688aeec30f6c65d82bb359 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 20 Jul 2023 08:31:17 -0500 Subject: [PATCH 285/501] [networking] Fix `--legacy-server-connect` (#7645) Bugfix for 227bf1a33be7b89cd7d44ad046844c4ccba104f4 Authored by: bashonly --- test/test_networking.py | 2 +- yt_dlp/YoutubeDL.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 3cf587a637..d4eba2a5df 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -1152,7 +1152,7 @@ def test_build_handler_params(self): 'debug_printtraffic': True, 'compat_opts': ['no-certifi'], 'nocheckcertificate': True, - 'legacy_server_connect': True, + 'legacyserverconnect': True, }) as ydl: rh = self.build_handler(ydl) assert rh.headers.get('test') == 'testtest' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 1a2f42fe9a..324f9e99c4 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4097,7 +4097,7 @@ def build_request_director(self, handlers): 'verbose': 'debug_printtraffic', 'source_address': 'source_address', 'timeout': 'socket_timeout', - 'legacy_ssl_support': 'legacy_server_connect', + 'legacy_ssl_support': 'legacyserverconnect', 'enable_file_urls': 'enable_file_urls', 'client_cert': { 'client_certificate': 'client_certificate', From af86873218c24c3859ccf575a87f2b00a73b49d0 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Thu, 20 Jul 2023 08:40:31 -0500 Subject: [PATCH 286/501] [utils] Improve `parse_duration` Authored by: bashonly --- test/test_utils.py | 2 ++ yt_dlp/utils/_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 768edfd0cf..b36bc04c2f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -655,6 +655,8 @@ def test_parse_duration(self): self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88) self.assertEqual(parse_duration('01:02:03:050'), 3723.05) self.assertEqual(parse_duration('103:050'), 103.05) + self.assertEqual(parse_duration('1HR 3MIN'), 3780) + self.assertEqual(parse_duration('2hrs 3mins'), 7380) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 2e619f9ea4..abae0f17e4 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2021,7 +2021,7 @@ def parse_duration(s): )? T)? (?: - (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s* + (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s* )? 
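
The widened alternation just above, `h(?:(?:ou)?rs?)?`, is what makes the new `1HR 3MIN` and `2hrs 3mins` tests pass: it now accepts `h`, `hr`, `hrs`, `hour`, and `hours`. A self-contained check of just that piece (case-insensitive matching is assumed here, mirroring the uppercase test input):

    import re

    HOURS = re.compile(r'(?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?', re.IGNORECASE)
    for text in ('1h', '1hr', '2hrs', '3 hours', '4HR'):
        assert HOURS.fullmatch(text), text
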
(?: (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s* From f4ea501551526ebcb54d19b84cf0ebe798583a85 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 20 Jul 2023 09:02:50 -0500 Subject: [PATCH 287/501] [ie/MagellanTV] Add extractor (#7616) Closes #7529 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/magellantv.py | 50 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 yt_dlp/extractor/magellantv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2af99b3dad..bcd8dbe006 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1012,6 +1012,7 @@ LyndaCourseIE ) from .m6 import M6IE +from .magellantv import MagellanTVIE from .magentamusik360 import MagentaMusik360IE from .mailru import ( MailRuIE, diff --git a/yt_dlp/extractor/magellantv.py b/yt_dlp/extractor/magellantv.py new file mode 100644 index 0000000000..0947a450a6 --- /dev/null +++ b/yt_dlp/extractor/magellantv.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import parse_age_limit, parse_duration, traverse_obj + + +class MagellanTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v', + 'info_dict': { + 'id': 'my-dads-on-death-row', + 'ext': 'mp4', + 'title': 'My Dad\'s On Death Row', + 'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a', + 'duration': 3780.0, + 'age_limit': 14, + 'tags': ['Justice', 'Reality', 'United States', 'True Crime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations', + 'info_dict': { + 'id': 'james-bulger-the-new-revelations', + 'ext': 'mp4', + 'title': 'James Bulger: The New Revelations', + 'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2', + 'duration': 2640.0, + 'age_limit': 0, + 'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'], + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']['video']['detail'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('metadata', 'description', {str}), + 'duration': ('duration', {parse_duration}), + 'age_limit': ('ratingCategory', {parse_age_limit}), + 'tags': ('tags', ..., {str}), + }), + } From 65cfa2b057d7946fbe322155a778fe206556d0c6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 20 Jul 2023 09:15:21 -0500 Subject: [PATCH 288/501] [ie/MuseAI] Add extractor (#7614) Closes #7543 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/museai.py | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 yt_dlp/extractor/museai.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bcd8dbe006..ae73a9f960 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1142,6 +1142,7 @@ ) from .muenchentv import MuenchenTVIE from .murrtube import MurrtubeIE, MurrtubeUserIE +from .museai import MuseAIIE from 
.musescore import MuseScoreIE from .musicdex import ( MusicdexSongIE, diff --git a/yt_dlp/extractor/museai.py b/yt_dlp/extractor/museai.py new file mode 100644 index 0000000000..7f66928c72 --- /dev/null +++ b/yt_dlp/extractor/museai.py @@ -0,0 +1,112 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + js_to_json, + traverse_obj, + url_or_none, +) + + +class MuseAIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?muse\.ai/(?:v|embed)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://muse.ai/embed/YdTWvUW', + 'md5': 'f994f9a38be1c3aaf9e37cbd7d76fe7c', + 'info_dict': { + 'id': 'YdTWvUW', + 'ext': 'mp4', + 'title': '2023-05-28-Grabien-1941111 (1)', + 'description': '', + 'uploader': 'Today News Africa', + 'uploader_id': 'TodayNewsAfrica', + 'upload_date': '20230528', + 'timestamp': 1685285044, + 'duration': 1291.3, + 'view_count': int, + 'availability': 'public', + }, + }, { + 'url': 'https://muse.ai/v/gQ4gGAA-0756', + 'md5': '52dbfc78e865e56dc19a1715badc35e8', + 'info_dict': { + 'id': 'gQ4gGAA', + 'ext': 'mp4', + 'title': '0756', + 'description': 'md5:0ca1483f9aac423e9a96ad00bb3a0785', + 'uploader': 'Aerial.ie', + 'uploader_id': 'aerial', + 'upload_date': '20210306', + 'timestamp': 1615072842, + 'duration': 21.4, + 'view_count': int, + 'availability': 'public', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://muse.ai/docs', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'docs', + 'title': 'muse.ai | docs', + 'description': 'md5:6c0293431481582739c82ee8902687fa', + 'age_limit': 0, + 'thumbnail': 'https://muse.ai/static/imgs/poster-img-docs.png', + }, + 'params': {'allowed_extractors': ['all', '-html5']}, + }] + _EMBED_REGEX = [r'<iframe[^>]*\bsrc=["\'](?P<url>https://muse\.ai/embed/\w+)'] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for embed_id in re.findall(r'<script>[^<]*\bMusePlayer\(\{[^}<]*\bvideo:\s*["\'](\w+)["\']', webpage): + yield f'https://muse.ai/embed/{embed_id}' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://muse.ai/embed/{video_id}', video_id) + data = self._search_json( + r'player\.setData\(', webpage, 'player data', video_id, transform_source=js_to_json) + + source_url = data['url'] + if not url_or_none(source_url): + raise ExtractorError('Unable to extract video URL') + + formats = [{ + 'url': source_url, + 'format_id': 'source', + 'quality': 1, + **traverse_obj(data, { + 'ext': ('filename', {determine_ext}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }] + if source_url.endswith('/data'): + base_url = f'{source_url[:-5]}/videos' + formats.extend(self._extract_m3u8_formats( + f'{base_url}/hls.m3u8', video_id, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + f'{base_url}/dash.mpd', video_id, mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('tcreated', {int_or_none}), + 'uploader': ('owner_name', {str}), + 'uploader_id': ('owner_username', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('mature', {lambda x: 18 if x else None}), + 'availability': ('visibility', {lambda x: x if x in ('private', 'unlisted') else 'public'}), + }), + } From 
9b16762f48914de9ac914601769c76668e433325 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Thu, 20 Jul 2023 22:09:52 +0200 Subject: [PATCH 289/501] [ie/crunchyroll] Remove initial state extraction (#7632) Authored by: Grub4K --- yt_dlp/extractor/crunchyroll.py | 73 +++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index adb3d5dcf6..ee34aced55 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -27,11 +27,24 @@ class CrunchyrollBaseIE(InfoExtractor): _AUTH_HEADERS = None _API_ENDPOINT = None _BASIC_AUTH = None - _QUERY = {} + _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') + _LOCALE_LOOKUP = { + 'ar': 'ar-SA', + 'de': 'de-DE', + '': 'en-US', + 'es': 'es-419', + 'es-es': 'es-ES', + 'fr': 'fr-FR', + 'it': 'it-IT', + 'pt-br': 'pt-BR', + 'pt-pt': 'pt-PT', + 'ru': 'ru-RU', + 'hi': 'hi-IN', + } @property def is_logged_in(self): - return self._get_cookies(self._BASE_URL).get('etp_rt') + return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) def _perform_login(self, username, password): if self.is_logged_in: @@ -62,49 +75,49 @@ def _perform_login(self, username, password): if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _update_query(self, lang): - if lang in CrunchyrollBaseIE._QUERY: - return - - webpage = self._download_webpage( - f'{self._BASE_URL}/{lang}', None, note=f'Retrieving main page (lang={lang or None})') - - initial_state = self._search_json(r'__INITIAL_STATE__\s*=', webpage, 'initial state', None) - CrunchyrollBaseIE._QUERY[lang] = traverse_obj(initial_state, { - 'locale': ('localization', 'locale'), - }) or None - - if CrunchyrollBaseIE._BASIC_AUTH: - return - - app_config = self._search_json(r'__APP_CONFIG__\s*=', webpage, 'app config', None) - cx_api_param = app_config['cxApiParams']['accountAuthClientId' if self.is_logged_in else 'anonClientId'] - self.write_debug(f'Using cxApiParam={cx_api_param}') - CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() - def _update_auth(self): if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): return - assert CrunchyrollBaseIE._BASIC_AUTH, '_update_query needs to be called at least one time beforehand' + if not CrunchyrollBaseIE._BASIC_AUTH: + cx_api_param = self._CLIENT_ID[self.is_logged_in] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' - auth_response = self._download_json( - f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + try: + auth_response = self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 403: + raise ExtractorError( + 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) + raise 
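
For reference, the Basic credential used by `_update_auth` above is nothing more than the client id, colon-terminated and base64-encoded. Reproducing it for the anonymous entry of `_CLIENT_ID` (index `False`, i.e. not logged in):

    import base64

    client_id = 'cr_web'  # _CLIENT_ID[False] from the patch
    header = 'Basic ' + base64.b64encode(f'{client_id}:'.encode()).decode()
    assert header == 'Basic Y3Jfd2ViOg=='
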
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) + def _locale_from_language(self, language): + config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) + return config_locale[0] if config_locale else self._LOCALE_LOOKUP.get(language) + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): - self._update_query(lang) self._update_auth() if not endpoint.startswith('/'): endpoint = f'/{endpoint}' + query = query.copy() + locale = self._locale_from_language(lang) + if locale: + query['locale'] = locale + return self._download_json( f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', - headers=CrunchyrollBaseIE._AUTH_HEADERS, query={**CrunchyrollBaseIE._QUERY[lang], **query}) + headers=CrunchyrollBaseIE._AUTH_HEADERS, query=query) def _call_api(self, path, internal_id, lang, note='api', query={}): if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): @@ -206,7 +219,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'''(?x) https?://(?:beta\.|www\.)?crunchyroll\.com/ - (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + (?:(?P<lang>\w{2}(?:-\w{2})?)/)? watch/(?!concert|musicvideo)(?P<id>\w+)''' _TESTS = [{ # Premium only @@ -304,7 +317,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): }, 'playlist_mincount': 5, }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'url': 'https://www.crunchyroll.com/de/watch/GY2P1Q98Y', 'only_matching': True, }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', From e57eb98222d29cc4c09ee975d3c492274a6e5be3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 21 Jul 2023 21:32:49 -0500 Subject: [PATCH 290/501] [fd/external] Fix ffmpeg input from stdin (#7655) Bugfix for 1ceb657bdd254ad961489e5060f2ccc7d556b729 Authored by: bashonly --- test/test_downloader_external.py | 5 +++++ yt_dlp/downloader/external.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py index d3d74df043..62f7d45d49 100644 --- a/test/test_downloader_external.py +++ b/test/test_downloader_external.py @@ -129,6 +129,11 @@ def test_make_cmd(self): 'ffmpeg', '-y', '-hide_banner', '-cookies', 'test=ytdlp; path=/; domain=.example.com;\r\n', '-i', 'http://www.example.com/', '-c', 'copy', '-f', 'mp4', 'file:test']) + # Test with non-url input (ffmpeg reads from stdin '-' for websockets) + downloader._call_downloader('test', {'url': 'x', 'ext': 'mp4'}) + self.assertEqual(self._args, [ + 'ffmpeg', '-y', '-hide_banner', '-i', 'x', '-c', 'copy', '-f', 'mp4', 'file:test']) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 4f52f6e8df..d3c3eba888 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -559,12 +559,13 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) + is_http = re.match(r'^https?://', fmt['url']) + cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] if cookies: args.extend(['-cookies', ''.join( 
f'{cookie.name}={cookie.value}; path={cookie.path}; domain={cookie.domain};\r\n' for cookie in cookies)]) - if fmt.get('http_headers') and re.match(r'^https?://', fmt['url']): + if fmt.get('http_headers') and is_http: # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. args.extend(['-headers', ''.join(f'{key}: {val}\r\n' for key, val in fmt['http_headers'].items())]) From 9f66247289b9f8ecf931833b3f5f127274dd2161 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 19 Jul 2023 07:11:52 +0530 Subject: [PATCH 291/501] [ie/abematv] Temporary fix for protocol handler Closes #7622 --- yt_dlp/extractor/abematv.py | 79 +++++-------------------------------- 1 file changed, 10 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 98ece8da7d..163b83c6da 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -27,74 +27,18 @@ update_url_query, ) -# NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862) - -def add_opener(ydl, handler): - ''' Add a handler for opening URLs, like _download_webpage ''' +def add_opener(ydl, handler): # FIXME: Create proper API in .networking + """Add a handler for opening URLs, like _download_webpage""" # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - assert isinstance(ydl._opener, urllib.request.OpenerDirector) - ydl._opener.add_handler(handler) - - -def remove_opener(ydl, handler): - ''' - Remove handler(s) for opening URLs - @param handler Either handler object itself or handler type. - Specifying handler type will remove all handler which isinstance returns True. 
- ''' - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 - # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - opener = ydl._opener - assert isinstance(ydl._opener, urllib.request.OpenerDirector) - if isinstance(handler, (type, tuple)): - find_cp = lambda x: isinstance(x, handler) - else: - find_cp = lambda x: x is handler - - removed = [] - for meth in dir(handler): - if meth in ["redirect_request", "do_open", "proxy_open"]: - # oops, coincidental match - continue - - i = meth.find("_") - protocol = meth[:i] - condition = meth[i + 1:] - - if condition.startswith("error"): - j = condition.find("_") + i + 1 - kind = meth[j + 1:] - try: - kind = int(kind) - except ValueError: - pass - lookup = opener.handle_error.get(protocol, {}) - opener.handle_error[protocol] = lookup - elif condition == "open": - kind = protocol - lookup = opener.handle_open - elif condition == "response": - kind = protocol - lookup = opener.process_response - elif condition == "request": - kind = protocol - lookup = opener.process_request - else: - continue - - handlers = lookup.setdefault(kind, []) - if handlers: - handlers[:] = [x for x in handlers if not find_cp(x)] - - removed.append(x for x in handlers if find_cp(x)) - - if removed: - for x in opener.handlers: - if find_cp(x): - x.add_parent(None) - opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] + rh = ydl._request_director.handlers['Urllib'] + if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES: + return + opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies) + assert isinstance(opener, urllib.request.OpenerDirector) + opener.add_handler(handler) + rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license') class AbemaLicenseHandler(urllib.request.BaseHandler): @@ -140,7 +84,7 @@ def abematv_license_open(self, url): ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) return urllib.response.addinfourl(io.BytesIO(response_data), headers={ - 'Content-Length': len(response_data), + 'Content-Length': str(len(response_data)), }, url=url, code=200) @@ -212,10 +156,7 @@ def _get_device_token(self): }) AbemaTVBaseIE._USERTOKEN = user_data['token'] - # don't allow adding it 2 times or more, though it's guarded - remove_opener(self._downloader, AbemaLicenseHandler) add_opener(self._downloader, AbemaLicenseHandler(self)) - return self._USERTOKEN def _get_media_token(self, invalidate=False, to_show=True): From a264433c9fba147ecae2420091614186cfeeb895 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 07:52:55 +0530 Subject: [PATCH 292/501] [outtmpl] Fix replacement for `playlist_index` --- test/test_YoutubeDL.py | 1 + yt_dlp/YoutubeDL.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b4f770ca58..c54c3ea5ce 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -687,6 +687,7 @@ def test(tmpl, expected, *, info=None, **params): test('%(duration_string)s', ('27:46:40', '27-46-40')) test('%(resolution)s', '1080p') test('%(playlist_index|)s', '001') + test('%(playlist_index&{}!)s', '001!') test('%(playlist_autonumber)s', '02') test('%(autonumber)s', '00001') test('%(autonumber+2)03d', '005', autonumber_start=3) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 324f9e99c4..dae29d9f95 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1300,16 +1300,16 @@ def create_key(outer_mobj): 
else: break - fmt = outer_mobj.group('format') - if fmt == 's' and value is not None and last_field in field_size_compat_map.keys(): - fmt = f'0{field_size_compat_map[last_field]:d}d' - if None not in (value, replacement): try: value = replacement_formatter.format(replacement, value) except ValueError: value, default = None, na + fmt = outer_mobj.group('format') + if fmt == 's' and last_field in field_size_compat_map.keys() and isinstance(value, int): + fmt = f'0{field_size_compat_map[last_field]:d}d' + flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' if value is None: From 994f7ef8e6003f4b7b258528755d0b6adcc31714 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 07:54:25 +0530 Subject: [PATCH 293/501] [ie/generic] Fix generic title for embeds Closes #7067 --- yt_dlp/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 8fa4c62217..f5c59a0930 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2562,7 +2562,7 @@ def _real_extract(self, url): self._downloader.write_debug('Looking for embeds') embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) if len(embeds) == 1: - return {**info_dict, **embeds[0]} + return merge_dicts(embeds[0], info_dict) elif embeds: return self.playlist_result(embeds, **info_dict) raise UnsupportedError(url) From 81b4712bca608b9015aa68a4d96661d56e9cb894 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 08:28:38 +0530 Subject: [PATCH 294/501] [extractor] Fix `--load-pages` --- yt_dlp/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d449187764..64a280dc05 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1042,7 +1042,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.full_url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: From e0c4db04dc82a699bdabd9821ddc239ebe17d30a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 08:56:45 +0530 Subject: [PATCH 295/501] [compat] Add `types.NoneType` --- yt_dlp/compat/types.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 yt_dlp/compat/types.py diff --git a/yt_dlp/compat/types.py b/yt_dlp/compat/types.py new file mode 100644 index 0000000000..ae70245642 --- /dev/null +++ b/yt_dlp/compat/types.py @@ -0,0 +1,12 @@ +# flake8: noqa: F405 +from types import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'types') +del passthrough_module + +try: + NoneType # >= 3.10 +except NameError: + NoneType = type(None) From 62b5c94cadaa5f596dc1a7083db9db12efe357be Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 09:08:12 +0530 Subject: [PATCH 296/501] [cleanup] Misc fixes Closes #7528 --- Changelog.md | 4 ++-- README.md | 4 ++-- devscripts/changelog_override.json | 7 ++++++- devscripts/make_changelog.py 
| 4 ++-- test/test_YoutubeDL.py | 3 +-- test/test_YoutubeDLCookieJar.py | 12 ++++++------ yt_dlp/YoutubeDL.py | 13 +++++++------ yt_dlp/compat/_legacy.py | 4 ++-- yt_dlp/cookies.py | 10 +++++----- yt_dlp/downloader/external.py | 2 +- yt_dlp/networking/_urllib.py | 4 ++-- yt_dlp/networking/common.py | 2 +- 12 files changed, 37 insertions(+), 32 deletions(-) diff --git a/Changelog.md b/Changelog.md index 622ae68b9b..32cdaca2ab 100644 --- a/Changelog.md +++ b/Changelog.md @@ -10,7 +10,7 @@ #### Important changes - Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj) - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains - Cookies are scoped when passed to external downloaders - - Add `cookie` field to info.json and deprecate `http_headers.Cookie` + - Add `cookies` field to info.json and deprecate `http_headers.Cookie` #### Core changes - [Allow extractors to mark formats as potentially DRM](https://github.com/yt-dlp/yt-dlp/commit/bc344cd456380999c1ee74554dfd432a38f32ec7) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan) @@ -51,7 +51,7 @@ #### Downloader changes - **http**: [Avoid infinite loop when no data is received](https://github.com/yt-dlp/yt-dlp/commit/662ef1e910b72e57957f06589925b2332ba52821) by [pukkandan](https://github.com/pukkandan) #### Misc. changes -- [Add CodeQL workflow](https://github.com/yt-dlp/yt-dlp/commit/6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17) ([#7497](https://github.com/yt-dlp/yt-dlp/issues/7497)) by [pukkandan](https://github.com/pukkandan) +- [Add CodeQL workflow](https://github.com/yt-dlp/yt-dlp/commit/6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17) ([#7497](https://github.com/yt-dlp/yt-dlp/issues/7497)) by [jorgectf](https://github.com/jorgectf) - **cleanup**: Miscellaneous: [337734d](https://github.com/yt-dlp/yt-dlp/commit/337734d4a8a6500bc65434843db346b5cbd05e81) by [pukkandan](https://github.com/pukkandan) - **docs**: [Minor fixes](https://github.com/yt-dlp/yt-dlp/commit/b532a3481046e1eabb6232ee8196fb696c356ff6) by [pukkandan](https://github.com/pukkandan) - **make_changelog**: [Skip reverted commits](https://github.com/yt-dlp/yt-dlp/commit/fa44802809d189fca0f4782263d48d6533384503) by [pukkandan](https://github.com/pukkandan) diff --git a/README.md b/README.md index 655cd41f52..ff88f817cf 100644 --- a/README.md +++ b/README.md @@ -1569,7 +1569,7 @@ ## Sorting Formats - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). If `--prefer-free-formats` is used, the order changes to `ogg` > `opus` > `webm` > `mp3` > `m4a` > `aac` - `ext`: Equivalent to `vext,aext` - `filesize`: Exact filesize, if known in advance - - `fs_approx`: Approximate filesize calculated from the manifests + - `fs_approx`: Approximate filesize - `size`: Exact filesize if available, otherwise approximate filesize - `height`: Height of video - `width`: Width of video @@ -1580,7 +1580,7 @@ ## Sorting Formats - `tbr`: Total average bitrate in KBit/s - `vbr`: Average video bitrate in KBit/s - `abr`: Average audio bitrate in KBit/s - - `br`: Equivalent to using `tbr,vbr,abr` + - `br`: Average bitrate in KBit/s, `tbr`/`vbr`/`abr` - `asr`: Audio sample rate in Hz **Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. 
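The sorting fields documented above are also exposed through the Python API as the `format_sort` parameter; a minimal usage sketch, where the URL is a placeholder:

    import yt_dlp

    # Sort formats by average bitrate; per the README text above, `br`
    # draws on tbr/vbr/abr.
    with yt_dlp.YoutubeDL({'format_sort': ['br']}) as ydl:
        info = ydl.extract_info('https://example.com/video', download=False)

The equivalent command-line form is `-S br` (`--format-sort`).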
diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index f573a74630..d03db3f232 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -63,6 +63,11 @@ { "action": "add", "when": "1ceb657bdd254ad961489e5060f2ccc7d556b729", - "short": "[priority] Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)\n - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains\n - Cookies are scoped when passed to external downloaders\n - Add `cookie` field to info.json and deprecate `http_headers.Cookie`" + "short": "[priority] Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)\n - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains\n - Cookies are scoped when passed to external downloaders\n - Add `cookies` field to info.json and deprecate `http_headers.Cookie`" + }, + { + "action": "change", + "when": "b03fa7834579a01cc5fba48c0e73488a16683d48", + "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b" } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 157c661267..84f72d52f3 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -53,10 +53,10 @@ def commit_lookup(cls): 'cookies', 'core', 'dependencies', + 'formats', 'jsinterp', 'networking', 'outtmpl', - 'formats', 'plugins', 'update', 'upstream', @@ -254,7 +254,7 @@ class CommitRange: (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))? ''', re.VERBOSE | re.DOTALL) EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) - REVERT_RE = re.compile(r'(?i:Revert)\s+([\da-f]{40})') + REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert)\s+([\da-f]{40})') UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index c54c3ea5ce..ab1250848b 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -26,7 +26,6 @@ ) from yt_dlp.utils.traversal import traverse_obj - TEST_URL = 'http://localhost/sample.mp4' @@ -687,7 +686,7 @@ def test(tmpl, expected, *, info=None, **params): test('%(duration_string)s', ('27:46:40', '27-46-40')) test('%(resolution)s', '1080p') test('%(playlist_index|)s', '001') - test('%(playlist_index&{}!)s', '001!') + test('%(playlist_index&{}!)s', '1!') test('%(playlist_autonumber)s', '02') test('%(autonumber)s', '00001') test('%(autonumber+2)03d', '005', autonumber_start=3) diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 0b7a0acdb5..fdb9baee59 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -17,10 +17,10 @@ class TestYoutubeDLCookieJar(unittest.TestCase): def test_keep_session_cookies(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) + cookiejar.load() tf = tempfile.NamedTemporaryFile(delete=False) try: - cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True) + cookiejar.save(filename=tf.name) temp = tf.read().decode() self.assertTrue(re.search( 
r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp)) @@ -32,7 +32,7 @@ def test_keep_session_cookies(self): def test_strip_httponly_prefix(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) + cookiejar.load() def assert_cookie_has_value(key): self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') @@ -42,20 +42,20 @@ def assert_cookie_has_value(key): def test_malformed_cookies(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/malformed_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) + cookiejar.load() # Cookies should be empty since all malformed cookie file entries # will be ignored self.assertFalse(cookiejar._cookies) def test_get_cookie_header(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) + cookiejar.load() header = cookiejar.get_cookie_header('https://www.foobar.foobar') self.assertIn('HTTPONLY_COOKIE', header) def test_get_cookies_for_url(self): cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') - cookiejar.load(ignore_discard=True, ignore_expires=True) + cookiejar.load() cookies = cookiejar.get_cookies_for_url('https://www.foobar.foobar/') self.assertEqual(len(cookies), 2) cookies = cookiejar.get_cookies_for_url('https://foobar.foobar/') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index dae29d9f95..c9cf07e530 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -572,7 +572,7 @@ class YoutubeDL: 'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', - 'preference', 'language', 'language_preference', 'quality', 'source_preference', + 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' } @@ -621,7 +621,8 @@ def __init__(self, params=None, auto_init=True): if self.params.get('no_color'): if self.params.get('color') is not None: - self.report_warning('Overwriting params from "color" with "no_color"') + self.params.setdefault('_warnings', []).append( + 'Overwriting params from "color" with "no_color"') self.params['color'] = 'no_color' term_allow_color = os.environ.get('TERM', '').lower() != 'dumb' @@ -949,7 +950,7 @@ def __enter__(self): def save_cookies(self): if self.params.get('cookiefile') is not None: - self.cookiejar.save(ignore_discard=True, ignore_expires=True) + self.cookiejar.save() def __exit__(self, *args): self.restore_console_title() @@ -3290,7 +3291,7 @@ def existing_video_file(*filepaths): fd, success = None, True if info_dict.get('protocol') or info_dict.get('url'): fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') - if fd is not FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( + if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( info_dict.get('section_start') or info_dict.get('section_end')): msg = ('This format cannot be partially 
downloaded' if FFmpegFD.available() else 'You have requested downloading the video partially, but ffmpeg is not installed') @@ -3451,7 +3452,7 @@ def ffmpeg_fixup(cndn, msg, cls): postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any(( isinstance(pp, FFmpegVideoConvertorPP) and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None) - ) for pp in self._pps['post_process']) + ) for pp in self._pps['post_process']) or fd == FFmpegFD if not postprocessed_by_ffmpeg: ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash', @@ -4031,7 +4032,7 @@ def _opener(self): """ Get a urllib OpenerDirector from the Urllib handler (deprecated). """ - self.deprecation_warning('YoutubeDL._opener() is deprecated, use YoutubeDL.urlopen()') + self.deprecation_warning('YoutubeDL._opener is deprecated, use YoutubeDL.urlopen()') handler = self._request_director.handlers['Urllib'] return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies) diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index 912907a021..90ccf0f14a 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -16,12 +16,12 @@ import shutil import socket import struct +import subprocess import tokenize import urllib.error import urllib.parse import urllib.request import xml.etree.ElementTree as etree -from subprocess import DEVNULL # isort: split import asyncio # noqa: F401 @@ -85,7 +85,7 @@ def compat_setenv(key, value, env=os.environ): compat_Struct = struct.Struct compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack -compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL compat_tokenize_tokenize = tokenize.tokenize compat_urllib_error = urllib.error compat_urllib_HTTPError = urllib.error.HTTPError diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 16f1918e6a..80428c747b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -97,7 +97,7 @@ def load_cookies(cookie_file, browser_specification, ydl): jar = YoutubeDLCookieJar(cookie_file) if not is_filename or os.access(cookie_file, os.R_OK): - jar.load(ignore_discard=True, ignore_expires=True) + jar.load() cookie_jars.append(jar) return _merge_cookie_jars(cookie_jars) @@ -1213,7 +1213,7 @@ def open(self, file, *, write=False): file.truncate(0) yield file - def _really_save(self, f, ignore_discard=False, ignore_expires=False): + def _really_save(self, f, ignore_discard, ignore_expires): now = time.time() for cookie in self: if (not ignore_discard and cookie.discard @@ -1234,7 +1234,7 @@ def _really_save(self, f, ignore_discard=False, ignore_expires=False): name, value ))) - def save(self, filename=None, *args, **kwargs): + def save(self, filename=None, ignore_discard=True, ignore_expires=True): """ Save cookies to a file. 
Code is taken from CPython 3.6 @@ -1253,9 +1253,9 @@ def save(self, filename=None, *args, **kwargs): with self.open(filename, write=True) as f: f.write(self._HEADER) - self._really_save(f, *args, **kwargs) + self._really_save(f, ignore_discard, ignore_expires) - def load(self, filename=None, ignore_discard=False, ignore_expires=False): + def load(self, filename=None, ignore_discard=True, ignore_expires=True): """Load cookies from a file.""" if filename is None: if self.filename is not None: diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index d3c3eba888..4ce8a3bf7d 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -137,7 +137,7 @@ def _write_cookies(self): self._cookies_tempfile = tmp_cookies.name self.to_screen(f'[download] Writing temporary cookies file to "{self._cookies_tempfile}"') # real_download resets _cookies_tempfile; if it's None then save() will write to cookiejar.filename - self.ydl.cookiejar.save(self._cookies_tempfile, ignore_discard=True, ignore_expires=True) + self.ydl.cookiejar.save(self._cookies_tempfile) return self.ydl.cookiejar.filename or self._cookies_tempfile def _call_downloader(self, tmpfilename, info_dict): diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 8a76676d94..ff3a22c8c1 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -28,7 +28,7 @@ make_socks_proxy_opts, select_proxy, ) -from .common import Features, RequestHandler, Response, register +from .common import Features, RequestHandler, Response, register_rh from .exceptions import ( CertificateVerifyError, HTTPError, @@ -372,7 +372,7 @@ def handle_response_read_exceptions(e): raise TransportError(cause=e) from e -@register +@register_rh class UrllibRH(RequestHandler, InstanceStoreMixin): _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp') _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h') diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 61196406dc..7f74579780 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -105,7 +105,7 @@ def send(self, request: Request) -> Response: _REQUEST_HANDLERS = {} -def register(handler): +def register_rh(handler): """Register a RequestHandler class""" assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler' assert handler.RH_KEY not in _REQUEST_HANDLERS, f'RequestHandler {handler.RH_KEY} already registered' From e7057383380d7d53815f8feaf90ca3dcbde88983 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 09:43:51 +0530 Subject: [PATCH 297/501] [ie/unsupported] List more sites with DRM Closes #7323, #3072, #5740, #5767, #6125 --- yt_dlp/extractor/unsupported.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index 1bc49786f9..bbcbf3acbb 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -42,6 +42,12 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'vootkids\.com', r'nowtv\.it/watch', r'tv\.apple\.com', + r'primevideo\.com', + r'hulu\.com', + r'resource\.inkryptvideos\.com', + r'joyn\.de', + r'amazon\.(?:\w{2}\.)?\w+/gp/video', + r'music\.amazon\.(?:\w{2}\.)?\w+', ) _TESTS = [{ @@ -111,6 +117,30 @@ class KnownDRMIE(UnsupportedInfoExtractor): # https://github.com/yt-dlp/yt-dlp/issues/5557 'url': 
'https://tv.apple.com/it/show/loot---una-fortuna/umc.cmc.5erbujil1mpazuerhr1udnk45?ctx_brand=tvs.sbd.4000', 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/3072 + 'url': 'https://www.joyn.de/play/serien/clannad/1-1-wo-die-kirschblueten-fallen', + 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/7323 + 'url': 'https://music.amazon.co.jp/albums/B088Y368TK', + 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/7323 + 'url': 'https://www.amazon.co.jp/gp/video/detail/B09X5HBYRS/', + 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/6125 + 'url': 'https://www.primevideo.com/region/eu/detail/0H3DDB4KBJFNDCKKLHNRLRLVKQ/ref=atv_br_def_r_br_c_unkc_1_10', + 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/5740 + 'url': 'https://resource.inkryptvideos.com/v2-a83ns52/iframe/index.html#video_id=7999ea0f6e03439eb40d056258c2d736&otp=xxx', + 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/5767 + 'url': 'https://www.hulu.com/movie/anthem-6b25fac9-da2b-45a3-8e09-e4156b0471cc', + 'only_matching': True, }] def _real_extract(self, url): From 25b6e8f94679b4458550702b46e61249b875a4fd Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 10:17:36 +0530 Subject: [PATCH 298/501] Fix e0c4db04dc82a699bdabd9821ddc239ebe17d30a for pypy --- yt_dlp/compat/types.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/compat/types.py b/yt_dlp/compat/types.py index ae70245642..4aa3b0efdd 100644 --- a/yt_dlp/compat/types.py +++ b/yt_dlp/compat/types.py @@ -7,6 +7,7 @@ del passthrough_module try: - NoneType # >= 3.10 -except NameError: + # NB: pypy has builtin NoneType, so checking NameError won't work + from types import NoneType # >= 3.10 +except ImportError: NoneType = type(None) From a250b247334ce9f641e709cbb64974da6034a2b3 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 22 Jul 2023 17:56:53 +0530 Subject: [PATCH 299/501] [compat] Ensure submodules are imported correctly Closes #7663 --- test/test_compat.py | 6 +++--- yt_dlp/compat/__init__.py | 7 ++----- yt_dlp/compat/_deprecated.py | 8 ++++++++ yt_dlp/compat/urllib/__init__.py | 3 +++ yt_dlp/utils/__init__.py | 6 ++---- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 003a97abf7..71ca7f99f1 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -9,15 +9,16 @@ import struct -import urllib.parse from yt_dlp import compat +from yt_dlp.compat import urllib # isort: split from yt_dlp.compat import ( compat_etree_fromstring, compat_expanduser, compat_urllib_parse_unquote, compat_urllib_parse_urlencode, ) +from yt_dlp.compat.urllib.request import getproxies class TestCompat(unittest.TestCase): @@ -28,8 +29,7 @@ def test_compat_passthrough(self): with self.assertWarns(DeprecationWarning): compat.WINDOWS_VT_MODE - # TODO: Test submodule - # compat.asyncio.events # Must not raise error + self.assertEqual(urllib.request.getproxies, getproxies) with self.assertWarns(DeprecationWarning): compat.compat_pycrypto_AES # Must not raise error diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index a41a80ebb6..832a9138d3 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -1,14 +1,11 @@ import os import sys -import warnings import xml.etree.ElementTree as etree -from ._deprecated import * # noqa: F401, F403 from .compat_utils import 
passthrough_module -# XXX: Implement this the same way as other DeprecationWarnings without circular import -passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( - DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5)) +passthrough_module(__name__, '._deprecated') +del passthrough_module # HTMLParseError has been deprecated in Python 3.3 and removed in diff --git a/yt_dlp/compat/_deprecated.py b/yt_dlp/compat/_deprecated.py index 14d37b2367..607bae9999 100644 --- a/yt_dlp/compat/_deprecated.py +++ b/yt_dlp/compat/_deprecated.py @@ -1,4 +1,12 @@ """Deprecated - New code should avoid these""" +import warnings + +from .compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) +del passthrough_module import base64 import urllib.error diff --git a/yt_dlp/compat/urllib/__init__.py b/yt_dlp/compat/urllib/__init__.py index 6b6b8e103d..b27cc6133c 100644 --- a/yt_dlp/compat/urllib/__init__.py +++ b/yt_dlp/compat/urllib/__init__.py @@ -1,6 +1,9 @@ # flake8: noqa: F405 from urllib import * # noqa: F403 +del request +from . import request # noqa: F401 + from ..compat_utils import passthrough_module passthrough_module(__name__, 'urllib') diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py index 0b00adddb4..c267e326f0 100644 --- a/yt_dlp/utils/__init__.py +++ b/yt_dlp/utils/__init__.py @@ -1,6 +1,4 @@ -# flake8: noqa: F401, F403 -import warnings - +# flake8: noqa: F403 from ..compat.compat_utils import passthrough_module passthrough_module(__name__, '._deprecated') @@ -9,4 +7,4 @@ # isort: off from .traversal import * from ._utils import * -from ._utils import _configuration_args, _get_exe_version_output +from ._utils import _configuration_args, _get_exe_version_output # noqa: F401 From 11de6fec9c9b8d34d1f90c8e6218ec58a3471b58 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 22 Jul 2023 08:10:25 -0500 Subject: [PATCH 300/501] [ie/PatreonCampaign] Fix extraction (#7664) Authored by: bashonly --- yt_dlp/extractor/patreon.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 447087436d..9316789df2 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -2,21 +2,21 @@ from .common import InfoExtractor from .vimeo import VimeoIE - from ..compat import compat_urllib_parse_unquote from ..networking.exceptions import HTTPError from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, clean_html, determine_ext, - ExtractorError, int_or_none, - KNOWN_EXTENSIONS, mimetype2ext, parse_iso8601, str_or_none, traverse_obj, try_get, url_or_none, + urljoin, ) @@ -404,8 +404,8 @@ def _entries(self, campaign_id): posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page) cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next')) - for post in posts_json.get('data') or []: - yield self.url_result(url_or_none(traverse_obj(post, ('attributes', 'patreon_url'))), 'Patreon') + for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')): + yield self.url_result(urljoin('https://www.patreon.com/', post_url), PatreonIE) if cursor is None: break From 86aea0d3a213da3be1da638b9b828e6f0ee1d59f Mon Sep 17 00:00:00 2001 
From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 23 Jul 2023 17:17:15 +1200 Subject: [PATCH 301/501] [networking] Add strict Request extension checking (#7604) Authored by: coletdjnz Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> --- test/test_networking.py | 54 ++++++++++++++++++++++++------------ yt_dlp/networking/_urllib.py | 5 ++++ yt_dlp/networking/common.py | 34 +++++++++++------------ 3 files changed, 58 insertions(+), 35 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index d4eba2a5df..1bd6afc88b 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -804,10 +804,10 @@ def test_httplib_validation_errors(self, handler): assert not isinstance(exc_info.value, TransportError) -def run_validation(handler, fail, req, **handler_kwargs): +def run_validation(handler, error, req, **handler_kwargs): with handler(**handler_kwargs) as rh: - if fail: - with pytest.raises(UnsupportedRequest): + if error: + with pytest.raises(error): rh.validate(req) else: rh.validate(req) @@ -824,6 +824,9 @@ class NoCheckRH(ValidationRH): _SUPPORTED_PROXY_SCHEMES = None _SUPPORTED_URL_SCHEMES = None + def _check_extensions(self, extensions): + extensions.clear() + class HTTPSupportedRH(ValidationRH): _SUPPORTED_URL_SCHEMES = ('http',) @@ -834,26 +837,26 @@ class HTTPSupportedRH(ValidationRH): ('https', False, {}), ('data', False, {}), ('ftp', False, {}), - ('file', True, {}), + ('file', UnsupportedRequest, {}), ('file', False, {'enable_file_urls': True}), ]), (NoCheckRH, [('http', False, {})]), - (ValidationRH, [('http', True, {})]) + (ValidationRH, [('http', UnsupportedRequest, {})]) ] PROXY_SCHEME_TESTS = [ # scheme, expected to fail ('Urllib', [ ('http', False), - ('https', True), + ('https', UnsupportedRequest), ('socks4', False), ('socks4a', False), ('socks5', False), ('socks5h', False), - ('socks', True), + ('socks', UnsupportedRequest), ]), (NoCheckRH, [('http', False)]), - (HTTPSupportedRH, [('http', True)]), + (HTTPSupportedRH, [('http', UnsupportedRequest)]), ] PROXY_KEY_TESTS = [ @@ -863,8 +866,22 @@ class HTTPSupportedRH(ValidationRH): ('unrelated', False), ]), (NoCheckRH, [('all', False)]), - (HTTPSupportedRH, [('all', True)]), - (HTTPSupportedRH, [('no', True)]), + (HTTPSupportedRH, [('all', UnsupportedRequest)]), + (HTTPSupportedRH, [('no', UnsupportedRequest)]), + ] + + EXTENSION_TESTS = [ + ('Urllib', [ + ({'cookiejar': 'notacookiejar'}, AssertionError), + ({'cookiejar': CookieJar()}, False), + ({'timeout': 1}, False), + ({'timeout': 'notatimeout'}, AssertionError), + ({'unsupported': 'value'}, UnsupportedRequest), + ]), + (NoCheckRH, [ + ({'cookiejar': 'notacookiejar'}, False), + ({'somerandom': 'test'}, False), # but any extension is allowed through + ]), ] @pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [ @@ -907,15 +924,16 @@ def test_empty_proxy(self, handler): @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1']) @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) def test_missing_proxy_scheme(self, handler, proxy_url): - run_validation(handler, True, Request('http://', proxies={'http': 'example.com'})) + run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': 'example.com'})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) - def test_cookiejar_extension(self, handler): - run_validation(handler, True, Request('http://', extensions={'cookiejar': 'notacookiejar'})) - - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) - def 
test_timeout_extension(self, handler): - run_validation(handler, True, Request('http://', extensions={'timeout': 'notavalidtimeout'})) + @pytest.mark.parametrize('handler,extensions,fail', [ + (handler_tests[0], extensions, fail) + for handler_tests in EXTENSION_TESTS + for extensions, fail in handler_tests[1] + ], indirect=['handler']) + def test_extension(self, handler, extensions, fail): + run_validation( + handler, fail, Request('http://', extensions=extensions)) def test_invalid_request_type(self): rh = self.ValidationRH(logger=FakeLogger()) diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index ff3a22c8c1..3fe5fa52ea 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -385,6 +385,11 @@ def __init__(self, *, enable_file_urls: bool = False, **kwargs): if self.enable_file_urls: self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file') + def _check_extensions(self, extensions): + super()._check_extensions(extensions) + extensions.pop('cookiejar', None) + extensions.pop('timeout', None) + def _create_instance(self, proxies, cookiejar): opener = urllib.request.OpenerDirector() handlers = [ diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 7f74579780..ab26a06282 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -21,6 +21,7 @@ TransportError, UnsupportedRequest, ) +from ..compat.types import NoneType from ..utils import ( bug_reports_message, classproperty, @@ -147,6 +148,7 @@ class RequestHandler(abc.ABC): a proxy url with an url scheme not in this list will raise an UnsupportedRequest. - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum. + The above may be set to None to disable the checks. Parameters: @@ -169,9 +171,14 @@ class RequestHandler(abc.ABC): Requests may have additional optional parameters defined as extensions. RequestHandler subclasses may choose to support custom extensions. + If an extension is supported, subclasses should extend _check_extensions(extensions) + to pop and validate the extension. + - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised. + The following extensions are defined for RequestHandler: - - `cookiejar`: Cookiejar to use for this request - - `timeout`: socket timeout to use for this request + - `cookiejar`: Cookiejar to use for this request. + - `timeout`: socket timeout to use for this request. + To enable these, add extensions.pop('<extension>', None) to _check_extensions Apart from the url protocol, proxies dict may contain the following keys: - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol. @@ -263,26 +270,19 @@ def _check_proxies(self, proxies): if scheme not in self._SUPPORTED_PROXY_SCHEMES: raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"') - def _check_cookiejar_extension(self, extensions): - if not extensions.get('cookiejar'): - return - if not isinstance(extensions['cookiejar'], CookieJar): - raise UnsupportedRequest('cookiejar is not a CookieJar') - - def _check_timeout_extension(self, extensions): - if extensions.get('timeout') is None: - return - if not isinstance(extensions['timeout'], (float, int)): - raise UnsupportedRequest('timeout is not a float or int') - def _check_extensions(self, extensions): - self._check_cookiejar_extension(extensions) - self._check_timeout_extension(extensions) + """Check extensions for unsupported extensions. 
Subclasses should extend this.""" + assert isinstance(extensions.get('cookiejar'), (CookieJar, NoneType)) + assert isinstance(extensions.get('timeout'), (float, int, NoneType)) def _validate(self, request): self._check_url_scheme(request) self._check_proxies(request.proxies or self.proxies) - self._check_extensions(request.extensions) + extensions = request.extensions.copy() + self._check_extensions(extensions) + if extensions: + # TODO: add support for optional extensions + raise UnsupportedRequest(f'Unsupported extensions: {", ".join(extensions.keys())}') @wrap_request_errors def validate(self, request: Request): From 39837ae3199aa934299badbd0d63243ed639e6c8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 23 Jul 2023 18:29:45 -0500 Subject: [PATCH 302/501] [ie/triller] Fix unlisted video extraction (#7670) Authored by: bashonly --- yt_dlp/extractor/triller.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py index c5d01c8271..56e51fea8f 100644 --- a/yt_dlp/extractor/triller.py +++ b/yt_dlp/extractor/triller.py @@ -66,13 +66,6 @@ def _get_comments(self, video_id, limit=15): 'timestamp': ('timestamp', {unified_timestamp}), })) - def _check_user_info(self, user_info): - if user_info.get('private') and not user_info.get('followed_by_me'): - raise ExtractorError('This video is private', expected=True) - elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'): - raise ExtractorError('The author of the video is blocked', expected=True) - return user_info - def _parse_video_info(self, video_info, username, user_id, display_id=None): video_id = str(video_info['id']) display_id = display_id or video_info.get('video_uuid') @@ -231,8 +224,6 @@ def _real_extract(self, url): f'{self._API_BASE_URL}/api/videos/{display_id}', display_id, headers=self._API_HEADERS)['videos'][0] - self._check_user_info(video_info.get('user') or {}) - return self._parse_video_info(video_info, username, None, display_id) @@ -287,9 +278,14 @@ def _entries(self, username, user_id, limit=6): def _real_extract(self, url): username = self._match_id(url) - user_info = self._check_user_info(self._download_json( + user_info = traverse_obj(self._download_json( f'{self._API_BASE_URL}/api/users/by_username/{username}', - username, note='Downloading user info', headers=self._API_HEADERS)['user']) + username, note='Downloading user info', headers=self._API_HEADERS), ('user', {dict})) or {} + + if user_info.get('private') and user_info.get('followed_by_me') not in (True, 'true'): + raise ExtractorError('This user profile is private', expected=True) + elif traverse_obj(user_info, (('blocked_by_user', 'blocking_user'), {bool}), get_all=False): + raise ExtractorError('The author of the video is blocked', expected=True) user_id = str_or_none(user_info.get('user_id')) if not user_id: From 550e65410a7a1b105923494ac44460a4dc1a15d9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 23 Jul 2023 19:09:52 -0500 Subject: [PATCH 303/501] [ie] Extract subtitles from SMIL manifests (#7667) Authored by: bashonly, pukkandan --- yt_dlp/extractor/common.py | 46 +++++++++++++++++++-------------- yt_dlp/extractor/livestream.py | 5 ++-- yt_dlp/extractor/mediaset.py | 6 +++-- yt_dlp/extractor/nbc.py | 1 - yt_dlp/extractor/theplatform.py | 4 +-- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 
64a280dc05..b69ac1d653 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2248,18 +2248,10 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.url - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2285,9 +2277,8 @@ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2326,7 +2317,14 @@ def _parse_smil_namespace(self, smil): return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2334,7 +2332,7 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 @@ -2382,8 +2380,9 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2404,11 +2403,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + 
self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2439,7 +2442,10 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): urls = [] diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index 692d6ab3a6..a05a0fa9ec 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -80,7 +80,8 @@ class LivestreamIE(InfoExtractor): }] _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' @@ -104,7 +105,7 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'tbr': tbr, 'preference': -1000, # Strictly inferior than all other formats? }) - return formats + return formats, {} def _extract_video_info(self, video_data): video_id = compat_str(video_data['id']) diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 1fa5299141..e3b728dcae 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -154,10 +154,12 @@ class MediasetIE(ThePlatformBaseIE): } }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) - return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + return super(MediasetIE, self)._parse_smil_formats_and_subtitles( + smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) def _check_drm_formats(self, tp_formats, video_id): has_nondrm, drm_manifest = False, '' diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 299b051745..b3c28ab55d 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -131,7 +131,6 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'age_limit': 0, 'thumbnail': r're:https?://.+\.jpg', }, - 'expected_warnings': ['Ignoring subtitle tracks'], 'params': { 'skip_download': 'm3u8', }, diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 537f6f6cd0..8307b912dd 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -45,7 +45,7 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d raise ExtractorError( error_element.attrib['abstract'], expected=True) - smil_formats = self._parse_smil_formats( + smil_formats, subtitles = self._parse_smil_formats_and_subtitles( meta, smil_url, video_id, 
namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com @@ -65,8 +65,6 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d formats.append(_format) - subtitles = self._parse_smil_subtitles(meta, default_ns) - return formats, subtitles def _download_theplatform_metadata(self, path, video_id): From 95abea9a03289da1384e5bda3d590223ccc0a238 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 25 Jul 2023 07:18:52 +1200 Subject: [PATCH 304/501] [test] Fix `httplib_validation_errors` test for old Python versions (#7677) Fixes https://github.com/yt-dlp/yt-dlp/issues/7674 Authored by: coletdjnz --- test/test_networking.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 1bd6afc88b..dbe28359be 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -785,22 +785,31 @@ def test_verify_cert_error_text(self, handler): validate_and_send(rh, Request(f'https://127.0.0.1:{self.https_port}/headers')) @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) - def test_httplib_validation_errors(self, handler): + @pytest.mark.parametrize('req,match,version_check', [ + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256 + # bpo-39603: Check implemented in 3.7.9+, 3.8.5+ + ( + Request('http://127.0.0.1', method='GET\n'), + 'method can\'t contain control characters', + lambda v: v < (3, 7, 9) or (3, 8, 0) <= v < (3, 8, 5) + ), + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1265 + # bpo-38576: Check implemented in 3.7.8+, 3.8.3+ + ( + Request('http://127.0.0. 1', method='GET'), + 'URL can\'t contain control characters', + lambda v: v < (3, 7, 8) or (3, 8, 0) <= v < (3, 8, 3) + ), + # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1288C31-L1288C50 + (Request('http://127.0.0.1', headers={'foo\n': 'bar'}), 'Invalid header name', None), + ]) + def test_httplib_validation_errors(self, handler, req, match, version_check): + if version_check and version_check(sys.version_info): + pytest.skip(f'Python {sys.version} version does not have the required validation for this test.') + with handler() as rh: - - # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256 - with pytest.raises(RequestError, match='method can\'t contain control characters') as exc_info: - validate_and_send(rh, Request('http://127.0.0.1', method='GET\n')) - assert not isinstance(exc_info.value, TransportError) - - # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1265 - with pytest.raises(RequestError, match='URL can\'t contain control characters') as exc_info: - validate_and_send(rh, Request('http://127.0.0. 
1', method='GET\n')) - assert not isinstance(exc_info.value, TransportError) - - # https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1288C31-L1288C50 - with pytest.raises(RequestError, match='Invalid header name') as exc_info: - validate_and_send(rh, Request('http://127.0.0.1', headers={'foo\n': 'bar'})) + with pytest.raises(RequestError, match=match) as exc_info: + validate_and_send(rh, req) assert not isinstance(exc_info.value, TransportError) From dae349da97cafe7357106a8f3187fd48a2ad1210 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Thu, 27 Jul 2023 09:53:22 -0500 Subject: [PATCH 305/501] [ie/WrestleUniversePPV] Fix HLS AES key extraction Fix bug in ef8fb7f029b816dfc95600727d84400591a3b5c5 Closes #7708 Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 99a8f01200..dd12804db3 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -300,7 +300,7 @@ def _real_extract(self, url): info['hls_aes'] = { 'key': hls_aes_key, 'iv': traverse_obj(video_data, ('hls', 'iv', {decrypt})), - }, + } elif traverse_obj(video_data, ('hls', 'encryptType', {int})): self.report_warning('HLS AES-128 key was not found in API response') From bbeacff7fcaa3b521066088a5ccbf34ef5070d1d Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 28 Jul 2023 02:56:02 +1200 Subject: [PATCH 306/501] [networking] Ignore invalid proxies in env (#7704) Authored by: coletdjnz --- test/test_networking.py | 8 +++++--- yt_dlp/networking/common.py | 10 +++++++--- yt_dlp/utils/networking.py | 8 +++++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index dbe28359be..f0938ab91c 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -930,10 +930,10 @@ def test_empty_proxy(self, handler): run_validation(handler, False, Request('http://', proxies={'http': None})) run_validation(handler, False, Request('http://'), proxies={'http': None}) - @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1']) + @pytest.mark.parametrize('proxy_url', ['//example.com', 'example.com', '127.0.0.1', '/a/b/c']) @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) - def test_missing_proxy_scheme(self, handler, proxy_url): - run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': 'example.com'})) + def test_invalid_proxy_url(self, handler, proxy_url): + run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url})) @pytest.mark.parametrize('handler,extensions,fail', [ (handler_tests[0], extensions, fail) @@ -1126,9 +1126,11 @@ def test_legacy_server_connect_error(self): ('http', '__noproxy__', None), ('no', '127.0.0.1,foo.bar', '127.0.0.1,foo.bar'), ('https', 'example.com', 'http://example.com'), + ('https', '//example.com', 'http://example.com'), ('https', 'socks5://example.com', 'socks5h://example.com'), ('http', 'socks://example.com', 'socks4://example.com'), ('http', 'socks4://example.com', 'socks4://example.com'), + ('unrelated', '/bad/proxy', '/bad/proxy'), # clean_proxies should ignore bad proxies ]) def test_clean_proxy(self, proxy_key, proxy_url, expected): # proxies should be cleaned in urlopen() diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index ab26a06282..3164df49b4 
100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -262,9 +262,13 @@ def _check_proxies(self, proxies): # Skip proxy scheme checks continue - # Scheme-less proxies are not supported - if urllib.request._parse_proxy(proxy_url)[0] is None: - raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme') + try: + if urllib.request._parse_proxy(proxy_url)[0] is None: + # Scheme-less proxies are not supported + raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme') + except ValueError as e: + # parse_proxy may raise on some invalid proxy urls such as "/a/b/c" + raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}') scheme = urllib.parse.urlparse(proxy_url).scheme.lower() if scheme not in self._SUPPORTED_PROXY_SCHEMES: diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index ac355ddc85..e6515ec8ee 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -98,7 +98,13 @@ def clean_proxies(proxies: dict, headers: HTTPHeaderDict): continue if proxy_url is not None: # Ensure proxies without a scheme are http. - proxy_scheme = urllib.request._parse_proxy(proxy_url)[0] + try: + proxy_scheme = urllib.request._parse_proxy(proxy_url)[0] + except ValueError: + # Ignore invalid proxy URLs. Sometimes these may be introduced through environment + # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`. + # If the proxy is going to be used, the Request Handler proxy validation will handle it. + continue if proxy_scheme is None: proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//') From c03a58ec9933e4a42c2d8fa80b8a0ddb2cde64e6 Mon Sep 17 00:00:00 2001 From: Amirreza Aflakparast <84932095+AmirAflak@users.noreply.github.com> Date: Fri, 28 Jul 2023 22:21:16 +0330 Subject: [PATCH 307/501] [ie/MotorTrendOnDemand] Update `_VALID_URL` (#7683) Closes #7680 Authored by: AmirAflak --- yt_dlp/extractor/dplay.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 6404752f7e..363b4bec9e 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -746,7 +746,7 @@ class MotorTrendIE(DiscoveryPlusBaseIE): class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?motortrend(?:ondemand\.com|\.com/plus)/detail' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', 'info_dict': { @@ -767,6 +767,25 @@ class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): 'upload_date': '20140101', 'tags': [], }, + }, { + 'url': 'https://www.motortrend.com/plus/detail/roadworthy-rescues-teaser-trailer/4922860/', + 'info_dict': { + 'id': '4922860', + 'ext': 'mp4', + 'title': 'Roadworthy Rescues | Teaser Trailer', + 'description': 'Derek Bieri helps Freiburger and Finnegan with their \'68 big-block Dart.', + 'display_id': 'roadworthy-rescues-teaser-trailer/4922860', + 'creator': 'Originals', + 'series': 'Roadworthy Rescues', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'upload_date': '20220907', + 'timestamp': 1662523200, + 'duration': 1066.356, + 'tags': [], + }, + }, { + 'url': 'https://www.motortrend.com/plus/detail/ugly-duckling/2450033/12439', + 'only_matching': True, }] _PRODUCT = 'MTOD' From a15fcd299e767a510debd8dc1646fe863b96ce0e Mon Sep 17 00:00:00 2001 From: nnoboa <90611593+nnoboa@users.noreply.github.com> 
Date: Fri, 28 Jul 2023 14:52:07 -0400 Subject: [PATCH 308/501] [ie/Wimbledon] Add extractor (#7551) Closes #7462 Authored by: nnoboa --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/wimbledon.py | 61 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 yt_dlp/extractor/wimbledon.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ae73a9f960..9d935a7d16 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2363,6 +2363,7 @@ from .whyp import WhypIE from .wikimedia import WikimediaIE from .willow import WillowIE +from .wimbledon import WimbledonIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( diff --git a/yt_dlp/extractor/wimbledon.py b/yt_dlp/extractor/wimbledon.py new file mode 100644 index 0000000000..ee4872e88b --- /dev/null +++ b/yt_dlp/extractor/wimbledon.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + parse_duration, + traverse_obj, +) + + +class WimbledonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?wimbledon\.com/\w+/video/media/(?P<id>\d+).html' + _TESTS = [{ + 'url': 'https://www.wimbledon.com/en_GB/video/media/6330247525112.html', + 'info_dict': { + 'id': '6330247525112', + 'ext': 'mp4', + 'timestamp': 1687972186, + 'description': '', + 'thumbnail': r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg', + 'upload_date': '20230628', + 'title': 'Coco Gauff | My Wimbledon Inspiration', + 'tags': ['features', 'trending', 'homepage'], + 'uploader_id': '3506358525001', + 'duration': 163072.0, + }, + }, { + 'url': 'https://www.wimbledon.com/en_GB/video/media/6308703111112.html', + 'info_dict': { + 'id': '6308703111112', + 'ext': 'mp4', + 'thumbnail': r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg', + 'description': 'null', + 'upload_date': '20220629', + 'uploader_id': '3506358525001', + 'title': 'Roblox | WimbleWorld ', + 'duration': 101440.0, + 'tags': ['features', 'kids'], + 'timestamp': 1656500867, + }, + }, { + 'url': 'https://www.wimbledon.com/en_US/video/media/6309327106112.html', + 'only_matching': True, + }, { + 'url': 'https://www.wimbledon.com/es_Es/video/media/6308377909112.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + f'https://www.wimbledon.com/relatedcontent/rest/v2/wim_v1/en/content/wim_v1_{video_id}_en', video_id) + + return { + '_type': 'url_transparent', + 'url': f'http://players.brightcove.net/3506358525001/default_default/index.html?videoId={video_id}', + 'ie_key': 'BrightcoveNew', + 'id': video_id, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('metadata', 'duration', {parse_duration}), + }), + } From 4bf912282a34b58b6b35d8f7e6be535770c89c76 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 29 Jul 2023 10:40:20 +1200 Subject: [PATCH 309/501] [networking] Remove dot segments during URL normalization (#7662) This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process. 
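A quick sketch of the user-visible effect (values mirror the test expectations added below; this assumes `Request` normalizes its URL on assignment, as the `yt_dlp/networking/common.py` hunk in this patch makes it do):

    # Illustrative only; dot segments are resolved per RFC 3986
    # section 5.2.4 before the request is sent.
    from yt_dlp.networking import Request

    req = Request('http://example.com/a/b/./../../headers')
    assert req.url == 'http://example.com/headers'
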
Closes #3355, #6526 Authored by: coletdjnz --- test/test_networking.py | 21 ++++++++++++++++++ test/test_utils.py | 42 ++++++++++++++++++++++++++++-------- yt_dlp/cookies.py | 6 +++--- yt_dlp/networking/_urllib.py | 7 +++--- yt_dlp/networking/common.py | 5 ++--- yt_dlp/utils/_legacy.py | 4 +++- yt_dlp/utils/_utils.py | 17 --------------- yt_dlp/utils/networking.py | 38 ++++++++++++++++++++++++++++++++ 8 files changed, 104 insertions(+), 36 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index f0938ab91c..684bf5f965 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -173,6 +173,12 @@ def do_GET(self): self.send_header('Location', self.path) self.send_header('Content-Length', '0') self.end_headers() + elif self.path == '/redirect_dotsegments': + self.send_response(301) + # redirect to /headers but with dot segments before + self.send_header('Location', '/a/b/./../../headers') + self.send_header('Content-Length', '0') + self.end_headers() elif self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): @@ -355,6 +361,21 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + def test_remove_dot_segments(self, handler): + with handler() as rh: + # This isn't a comprehensive test, + # but it should be enough to check whether the handler is removing dot segments + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers')) + assert res.status == 200 + assert res.url == f'http://127.0.0.1:{self.http_port}/headers' + res.close() + + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments')) + assert res.status == 200 + assert res.url == f'http://127.0.0.1:{self.http_port}/headers' + res.close() + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: diff --git a/test/test_utils.py b/test/test_utils.py index b36bc04c2f..453a01a1c2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -47,8 +47,6 @@ encode_base_n, encode_compat_str, encodeFilename, - escape_rfc3986, - escape_url, expand_path, extract_attributes, extract_basic_auth, @@ -132,7 +130,12 @@ xpath_text, xpath_with_ns, ) -from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.utils.networking import ( + HTTPHeaderDict, + escape_rfc3986, + normalize_url, + remove_dot_segments, +) class TestUtil(unittest.TestCase): @@ -933,24 +936,45 @@ def test_escape_rfc3986(self): self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar') self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar') - def test_escape_url(self): + def test_normalize_url(self): self.assertEqual( - escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'), + normalize_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'), 'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4' ) self.assertEqual( - escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'), + normalize_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'), 'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290' ) 
self.assertEqual( - escape_url('http://тест.рф/фрагмент'), + normalize_url('http://тест.рф/фрагмент'), 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( - escape_url('http://тест.рф/абв?абв=абв#абв'), + normalize_url('http://тест.рф/абв?абв=абв#абв'), 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) - self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + self.assertEqual(normalize_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + + self.assertEqual(normalize_url('http://www.example.com/../a/b/../c/./d.html'), 'http://www.example.com/a/c/d.html') + + def test_remove_dot_segments(self): + self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g') + self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6') + self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd') + self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/') + self.assertEqual(remove_dot_segments('/..'), '/') + self.assertEqual(remove_dot_segments('/./'), '/') + self.assertEqual(remove_dot_segments('/./a'), '/a') + self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi') + self.assertEqual(remove_dot_segments('/'), '/') + self.assertEqual(remove_dot_segments('/t'), '/t') + self.assertEqual(remove_dot_segments('t'), 't') + self.assertEqual(remove_dot_segments(''), '') + self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c') + self.assertEqual(remove_dot_segments('../a'), 'a') + self.assertEqual(remove_dot_segments('./a'), 'a') + self.assertEqual(remove_dot_segments('.'), '') + self.assertEqual(remove_dot_segments('////'), '////') def test_js_to_json_vars_strings(self): self.assertDictEqual( diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 80428c747b..157f5b0c2b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -33,7 +33,6 @@ from .utils import ( Popen, error_to_str, - escape_url, expand_path, is_path_like, sanitize_url, @@ -42,6 +41,7 @@ write_string, ) from .utils._utils import _YDLLogger +from .utils.networking import normalize_url CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -1308,7 +1308,7 @@ def prepare_line(line): def get_cookie_header(self, url): """Generate a Cookie HTTP header for a given url""" - cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + cookie_req = urllib.request.Request(normalize_url(sanitize_url(url))) self.add_cookie_header(cookie_req) return cookie_req.get_header('Cookie') @@ -1317,7 +1317,7 @@ def get_cookies_for_url(self, url): # Policy `_now` attribute must be set before calling `_cookies_for_request` # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 self._policy._now = self._now = int(time.time()) - return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url)))) + return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url)))) def clear(self, *args, **kwargs): with contextlib.suppress(KeyError): diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 3fe5fa52ea..0c4794954b 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -41,7 +41,8 @@ from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError from ..socks import sockssocket -from ..utils import escape_url, 
update_url_query +from ..utils import update_url_query +from ..utils.networking import normalize_url SUPPORTED_ENCODINGS = ['gzip', 'deflate'] CONTENT_DECODE_ERRORS = [zlib.error, OSError] @@ -179,7 +180,7 @@ def http_request(self, req): # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) # the code of this workaround has been moved here from YoutubeDL.urlopen() url = req.get_full_url() - url_escaped = escape_url(url) + url_escaped = normalize_url(url) # Substitute URL if any change after escaping if url != url_escaped: @@ -212,7 +213,7 @@ def http_response(self, req, resp): if location: # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 location = location.encode('iso-8859-1').decode() - location_escaped = escape_url(location) + location_escaped = normalize_url(location) if location != location_escaped: del resp.headers['Location'] resp.headers['Location'] = location_escaped diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 3164df49b4..792e062fdf 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -27,10 +27,9 @@ classproperty, deprecation_warning, error_to_str, - escape_url, update_url_query, ) -from ..utils.networking import HTTPHeaderDict +from ..utils.networking import HTTPHeaderDict, normalize_url if typing.TYPE_CHECKING: RequestData = bytes | Iterable[bytes] | typing.IO | None @@ -372,7 +371,7 @@ def url(self, url): raise TypeError('url must be a string') elif url.startswith('//'): url = 'http:' + url - self._url = escape_url(url) + self._url = normalize_url(url) @property def method(self): diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 0770009717..dde02092c9 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -8,6 +8,8 @@ import zlib from ._utils import Popen, decode_base_n, preferredencoding +from .networking import escape_rfc3986 # noqa: F401 +from .networking import normalize_url as escape_url # noqa: F401 from .traversal import traverse_obj from ..dependencies import certifi, websockets from ..networking._helper import make_ssl_context @@ -197,7 +199,7 @@ def request_to_url(req): def sanitized_Request(url, *args, **kwargs): - from ..utils import escape_url, extract_basic_auth, sanitize_url + from ..utils import extract_basic_auth, sanitize_url url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) if auth_header is not None: headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index abae0f17e4..f5552ce802 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2464,23 +2464,6 @@ def lowercase_escape(s): s) -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = urllib.parse.urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - def parse_qs(url, **kwargs): return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index e6515ec8ee..bbcea84d2c 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -121,3 +121,41 @@ 
def clean_headers(headers: HTTPHeaderDict):
     if 'Youtubedl-No-Compression' in headers:  # compat
         del headers['Youtubedl-No-Compression']
         headers['Accept-Encoding'] = 'identity'
+
+
+def remove_dot_segments(path):
+    # Implements RFC3986 5.2.4 remove_dot_segments
+    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
+    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
+    output = []
+    segments = path.split('/')
+    for s in segments:
+        if s == '.':
+            continue
+        elif s == '..':
+            if output:
+                output.pop()
+        else:
+            output.append(s)
+    if not segments[0] and (not output or output[0]):
+        output.insert(0, '')
+    if segments[-1] in ('.', '..'):
+        output.append('')
+    return '/'.join(output)
+
+
+def escape_rfc3986(s):
+    """Escape non-ASCII characters as suggested by RFC 3986"""
+    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
+
+
+def normalize_url(url):
+    """Normalize URL as suggested by RFC 3986"""
+    url_parsed = urllib.parse.urlparse(url)
+    return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
+        path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
+        params=escape_rfc3986(url_parsed.params),
+        query=escape_rfc3986(url_parsed.query),
+        fragment=escape_rfc3986(url_parsed.fragment)
+    ).geturl()

From ba06d77a316650ff057347d224b5afa8b203ad65 Mon Sep 17 00:00:00 2001
From: coletdjnz <coletdjnz@protonmail.com>
Date: Sat, 29 Jul 2023 18:20:42 +1200
Subject: [PATCH 310/501] [ie/youtube] Add `player_params` extractor arg (#7719)

Authored by: coletdjnz
---
 README.md                   |  1 +
 yt_dlp/extractor/youtube.py | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ff88f817cf..b82d92a6ec 100644
--- a/README.md
+++ b/README.md
@@ -1802,6 +1802,7 @@ #### youtube
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
 * `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
+* `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
 * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
     * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. 
`1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2b3776aa1d..940a4995b5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3583,8 +3583,6 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - _PLAYER_PARAMS = 'CgIQBg==' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): session_index = self._extract_session_index(player_ytcfg, master_ytcfg) @@ -3597,7 +3595,11 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, 'videoId': video_id, } if _split_innertube_client(client)[0] == 'android': - yt_query['params'] = self._PLAYER_PARAMS + yt_query['params'] = 'CgIQBg==' + + pp_arg = self._configuration_arg('player_params', [None])[0] + if pp_arg: + yt_query['params'] = pp_arg yt_query.update(self._generate_player_context(sts)) return self._extract_response( @@ -4016,6 +4018,9 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} + pp = self._configuration_arg('player_params', [None])[0] + if pp: + query['pp'] = pp webpage = self._download_webpage( webpage_url, video_id, fatal=False, query=query) From 9a04113dfbb69b904e4e2bea736da293505786b8 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Fri, 28 Jul 2023 17:21:45 -0500 Subject: [PATCH 311/501] [ie/Reddit] Fix thumbnail extraction Authored by: bashonly --- yt_dlp/extractor/reddit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 13615e82f9..813e62874c 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -240,6 +240,7 @@ def add_thumbnail(src): 'url': unescapeHTML(thumbnail_url), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), + 'http_headers': {'Accept': '*/*'}, }) for image in try_get(data, lambda x: x['preview']['images']) or []: From 86eeb044c2342d68c6ef177577f87852e6badd85 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Jul 2023 10:47:43 -0500 Subject: [PATCH 312/501] [ie/hotstar] Support `/clips/` URLs (#7710) Closes #7699 Authored by: bashonly --- yt_dlp/extractor/hotstar.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 324e9f51db..cdd9379416 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -84,7 +84,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?P<type>movies|sports|episode|(?P<tv>tv|shows))/ + (?P<type>movies|sports|clips|episode|(?P<tv>tv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? 
[^/?#]+/ @@ -142,6 +142,18 @@ class HotStarIE(HotStarBaseIE): 'duration': 1272, 'channel_id': 3, }, + }, { + 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286', + 'info_dict': { + 'id': '1000262286', + 'ext': 'mp4', + 'title': 'E3 - SaiRat, Kahani Pyaar Ki', + 'description': 'md5:e3b4b3203bc0c5396fe7d0e4948a6385', + 'episode': 'E3 - SaiRat, Kahani Pyaar Ki', + 'upload_date': '20210606', + 'timestamp': 1622943900, + 'duration': 5395, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -160,6 +172,7 @@ class HotStarIE(HotStarBaseIE): 'episode': 'episode', 'tv': 'episode', 'shows': 'episode', + 'clips': 'content', None: 'content', } From 127a22460658ac39cbe5c4b3fb88d578363e0dfa Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Jul 2023 11:01:43 -0500 Subject: [PATCH 313/501] [ie/LBRY] Fix original format extraction (#7711) Authored by: bashonly --- yt_dlp/extractor/lbry.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 6af64f0df4..7dd3a48613 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -1,5 +1,6 @@ import functools import json +import re import urllib.parse from .common import InfoExtractor @@ -83,7 +84,7 @@ class LBRYIE(LBRYBaseIE): _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', - 'md5': 'fffd15d76062e9a985c22c7c7f2f4805', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', 'info_dict': { 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', 'ext': 'mp4', @@ -132,9 +133,8 @@ class LBRYIE(LBRYBaseIE): 'license': 'None', } }, { - # HLS 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', - 'md5': '25049011f3c8bc2f8b60ad88a031837e', + 'md5': 'c35fac796f62a14274b4dc2addb5d0ba', 'info_dict': { 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', 'ext': 'mp4', @@ -246,12 +246,13 @@ def _real_extract(self, url): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] - # GET request returns original video/audio file if available + # GET request to v3 API returns original video/audio file if available + direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url) ext = urlhandle_detect_ext(self._request_webpage( - streaming_url, display_id, 'Checking for original quality', headers=headers)) + direct_url, display_id, 'Checking for original quality', headers=headers)) if ext != 'm3u8': formats.append({ - 'url': streaming_url, + 'url': direct_url, 'format_id': 'original', 'quality': 1, **traverse_obj(result, ('value', { From b09bd0c19648f60c59fb980cd454cb0069959fb9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Jul 2023 11:14:16 -0500 Subject: [PATCH 314/501] [ie/tiktok] Fix audio-only format extraction (#7712) Closes #6608 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 43 +++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 48de61f934..f14c4f9d6a 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -205,15 +205,16 @@ def parse_url_key(url_key): known_resolutions = {} - def mp3_meta(url): + def audio_meta(url): + ext = determine_ext(url, default_ext='m4a') return { 'format_note': 'Music track', - 'ext': 'mp3', - 'acodec': 'mp3', + 'ext': ext, + 'acodec': 'aac' if ext == 'm4a' 
else ext, 'vcodec': 'none', 'width': None, 'height': None, - } if determine_ext(url) == 'mp3' else {} + } if ext == 'mp3' or '-music-' in url else {} def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) @@ -231,7 +232,7 @@ def extract_addr(addr, add_meta={}): **add_meta, **parsed_meta, 'format_note': join_nonempty( add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '), - **mp3_meta(url), + **audio_meta(url), } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -527,6 +528,7 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, + 'params': {'skip_download': True}, # XXX: unable to download video data: HTTP Error 403: Forbidden }, { # Video without title and description 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', @@ -600,7 +602,7 @@ class TikTokIE(TikTokBaseIE): }, { # only available via web 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662', - 'md5': '8d8c0be14127020cd9f5def4a2e6b411', + 'md5': '6aba7fad816e8709ff2c149679ace165', 'info_dict': { 'id': '7206382937372134662', 'ext': 'mp4', @@ -637,8 +639,8 @@ class TikTokIE(TikTokBaseIE): 'uploader_id': '86328792343818240', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', - 'creator': 't8', - 'artist': 't8', + 'creator': 'tate mcrae', + 'artist': 'tate mcrae', 'track': 'original sound', 'upload_date': '20220609', 'timestamp': 1654805899, @@ -650,6 +652,31 @@ class TikTokIE(TikTokBaseIE): 'thumbnail': r're:^https://.+\.webp', }, 'params': {'format': 'bytevc1_1080p_808907-0'}, + }, { + # Slideshow, audio-only m4a format + 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594', + 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d', + 'info_dict': { + 'id': '7253412088251534594', + 'ext': 'm4a', + 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', + 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', + 'uploader': 'hara_yoimiya', + 'uploader_id': '6582536342634676230', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', + 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', + 'creator': 'лампочка', + 'artist': 'Øneheart', + 'album': 'watching the stars', + 'track': 'watching the stars', + 'upload_date': '20230708', + 'timestamp': 1688816612, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', From de20687ee6b742646128a7629b57096631a20619 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 28 Jul 2023 13:48:17 +0530 Subject: [PATCH 315/501] [test] Fix `test_load_certifi` Closes #7688, #7675 --- test/test_networking_utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index ef46f79ed0..dbf656090d 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -95,17 +95,20 @@ def test_make_socks_proxy_unknown(self): 
@pytest.mark.skipif(not certifi, reason='certifi is not installed') def test_load_certifi(self): + context_certifi = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context_certifi.load_verify_locations(cafile=certifi.where()) context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - context2 = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) ssl_load_certs(context, use_certifi=True) - context2.load_verify_locations(cafile=certifi.where()) - assert context.get_ca_certs() == context2.get_ca_certs() + assert context.get_ca_certs() == context_certifi.get_ca_certs() - # Test load normal certs - # XXX: could there be a case where system certs are the same as certifi? - context3 = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - ssl_load_certs(context3, use_certifi=False) - assert context3.get_ca_certs() != context.get_ca_certs() + context_default = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context_default.load_default_certs() + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_load_certs(context, use_certifi=False) + assert context.get_ca_certs() == context_default.get_ca_certs() + + if context_default.get_ca_certs() == context_certifi.get_ca_certs(): + pytest.skip('System uses certifi as default. The test is not valid') @pytest.mark.parametrize('method,status,expected', [ ('GET', 303, 'GET'), From 3f7965105d8d2048359e67c1e8b8ebd51588143b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 30 Jul 2023 03:18:10 +0530 Subject: [PATCH 316/501] [utils] HTTPHeaderDict: Handle byte values --- test/test_utils.py | 2 ++ yt_dlp/utils/networking.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 453a01a1c2..91e3ffd39e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2344,6 +2344,8 @@ def test_traverse_obj(self): def test_http_header_dict(self): headers = HTTPHeaderDict() + headers['ytdl-test'] = b'0' + self.assertEqual(list(headers.items()), [('Ytdl-Test', '0')]) headers['ytdl-test'] = 1 self.assertEqual(list(headers.items()), [('Ytdl-Test', '1')]) headers['Ytdl-test'] = '2' diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index bbcea84d2c..ba0493cc2b 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -65,6 +65,8 @@ def __init__(self, *args, **kwargs): self.update(kwargs) def __setitem__(self, key, value): + if isinstance(value, bytes): + value = value.decode('latin-1') super().__setitem__(key.title(), str(value)) def __getitem__(self, key): From 8cb7fc44db010e965d808ee679ef0725cb6e147c Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 30 Jul 2023 03:21:35 +0530 Subject: [PATCH 317/501] Fix `--check-formats` Bug in bc344cd456380999c1ee74554dfd432a38f32ec7 --- yt_dlp/YoutubeDL.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index c9cf07e530..6e8be40ba2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2339,13 +2339,13 @@ def _merge(formats_pair): return new_dict def _check_formats(formats): - if (self.params.get('check_formats') is not None + if self.params.get('check_formats') == 'selected': + yield from self._check_formats(formats) + return + elif (self.params.get('check_formats') is not None or self.params.get('allow_unplayable_formats')): yield from formats return - elif self.params.get('check_formats') == 'selected': - yield from self._check_formats(formats) - return for f in formats: if f.get('has_drm'): From 6148833f5ceb7674142ddb8d761ffe03cee7df69 Mon Sep 17 00:00:00 2001 
From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 30 Jul 2023 03:36:17 +0530 Subject: [PATCH 318/501] [cleanup] Misc --- test/test_YoutubeDL.py | 1 + test/test_networking.py | 8 +++++--- yt_dlp/YoutubeDL.py | 14 ++++++-------- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/lbry.py | 6 +++--- yt_dlp/extractor/netverse.py | 4 ++-- yt_dlp/extractor/ninenow.py | 2 +- yt_dlp/extractor/vk.py | 4 ++-- yt_dlp/extractor/wimbledon.py | 2 +- yt_dlp/networking/common.py | 9 +++++---- 10 files changed, 27 insertions(+), 25 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ab1250848b..3cfb61fb26 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -831,6 +831,7 @@ def expect_same_infodict(out): test('%(id&hi {:>10} {}|)s', 'hi 1234 1234') test(R'%(id&{0} {}|)s', 'NA') test(R'%(id&{0.1}|)s', 'NA') + test('%(height&{:,d})S', '1,080') # Laziness def gen(): diff --git a/test/test_networking.py b/test/test_networking.py index 684bf5f965..9c33b0d4c6 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -29,6 +29,7 @@ from http.cookiejar import CookieJar from test.helper import FakeYDL, http_server_port +from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import brotli from yt_dlp.networking import ( HEADRequest, @@ -478,7 +479,7 @@ def test_request_cookie_header(self, handler): assert 'Cookie: test=test' not in res # Specified Cookie header should override global cookiejar for that request - cookiejar = http.cookiejar.CookieJar() + cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( version=0, name='test', value='ytdlp', port=None, port_specified=False, domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/', @@ -505,7 +506,7 @@ def test_incompleteread(self, handler): @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) def test_cookies(self, handler): - cookiejar = http.cookiejar.CookieJar() + cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( 0, 'test', 'ytdlp', None, False, '127.0.0.1', True, False, '/headers', True, False, None, False, None, None, {})) @@ -903,7 +904,8 @@ class HTTPSupportedRH(ValidationRH): EXTENSION_TESTS = [ ('Urllib', [ ({'cookiejar': 'notacookiejar'}, AssertionError), - ({'cookiejar': CookieJar()}, False), + ({'cookiejar': YoutubeDLCookieJar()}, False), + ({'cookiejar': CookieJar()}, AssertionError), ({'timeout': 1}, False), ({'timeout': 'notatimeout'}, AssertionError), ({'unsupported': 'value'}, UnsupportedRequest), diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 6e8be40ba2..db5932c443 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -256,8 +256,6 @@ class YoutubeDL: overwrites: Overwrite all video and metadata files if True, overwrite only non-video files if None and don't overwrite any file if False - For compatibility with youtube-dl, - "nooverwrites" may also be used instead playlist_items: Specific indices of playlist to download. playlistrandom: Download playlist items in random order. lazy_playlist: Process playlist entries as they are received. @@ -553,6 +551,7 @@ class YoutubeDL: You can reduce network I/O by disabling it if you don't care about HLS. 
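As an aside on the `Cookie` compat path in the hunk above: a cookie supplied through `http_headers` is parsed into the cookiejar and then stripped from the headers, so it is never sent as a raw header. A rough sketch, assuming a default `YoutubeDL` construction (the header value here is a placeholder):

    # Sketch: _load_cookies() consumes the compat Cookie header and
    # http_headers no longer contains it afterwards.
    import yt_dlp

    ydl = yt_dlp.YoutubeDL({'http_headers': {'Cookie': 'session=abc123'}})
    assert 'Cookie' not in ydl.params['http_headers']
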
(only for youtube) no_color: Same as `color='no_color'` + no_overwrites: Same as `overwrites=False` """ _NUMERIC_FIELDS = { @@ -604,6 +603,7 @@ def __init__(self, params=None, auto_init=True): self._playlist_level = 0 self._playlist_urls = set() self.cache = Cache(self) + self.__header_cookies = [] stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout self._out_files = Namespace( @@ -632,7 +632,7 @@ def process_color_policy(stream): policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False) if policy in ('auto', None): return term_allow_color and supports_terminal_sequences(stream) - assert policy in ('always', 'never', 'no_color') + assert policy in ('always', 'never', 'no_color'), policy return {'always': True, 'never': False}.get(policy, policy) self._allow_colors = Namespace(**{ @@ -681,12 +681,10 @@ def process_color_policy(stream): self.params['compat_opts'] = set(self.params.get('compat_opts', ())) self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) - self.__header_cookies = [] self._load_cookies(self.params['http_headers'].get('Cookie')) # compat self.params['http_headers'].pop('Cookie', None) + self._request_director = self.build_request_director(_REQUEST_HANDLERS.values()) - self._request_director = self.build_request_director( - sorted(_REQUEST_HANDLERS.values(), key=lambda rh: rh.RH_NAME.lower())) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -3977,7 +3975,7 @@ def get_encoding(stream): })) or 'none')) write_debug(f'Proxy map: {self.proxies}') - # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers)}') + # write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -4080,7 +4078,7 @@ def urlopen(self, req): def build_request_director(self, handlers): logger = _YDLLogger(self) - headers = self.params.get('http_headers').copy() + headers = self.params['http_headers'].copy() proxies = self.proxies.copy() clean_headers(headers) clean_proxies(proxies, headers) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b69ac1d653..7deab995c4 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -729,7 +729,7 @@ def extract(self, url): except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), + e.video_id = e.video_id or self.get_temp_id(url) e.ie = e.ie or self.IE_NAME, e.traceback = e.traceback or sys.exc_info()[2] raise diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 7dd3a48613..9a9f9256fe 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -248,9 +248,9 @@ def _real_extract(self, url): # GET request to v3 API returns original video/audio file if available direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url) - ext = urlhandle_detect_ext(self._request_webpage( - direct_url, display_id, 'Checking for original quality', headers=headers)) - if ext != 'm3u8': + urlh = self._request_webpage( + direct_url, display_id, 'Checking for original quality', headers=headers, fatal=False) + if urlh and urlhandle_detect_ext(urlh) != 'm3u8': formats.append({ 'url': direct_url, 'format_id': 'original', diff --git a/yt_dlp/extractor/netverse.py 
b/yt_dlp/extractor/netverse.py index 398198a1b0..ef53e15da6 100644 --- a/yt_dlp/extractor/netverse.py +++ b/yt_dlp/extractor/netverse.py @@ -160,7 +160,7 @@ class NetverseIE(NetverseBaseIE): 'uploader': 'Net Prime', 'comment_count': int, }, - 'params':{ + 'params': { 'getcomments': True } }, { @@ -187,7 +187,7 @@ class NetverseIE(NetverseBaseIE): 'season': 'Season 1', 'comment_count': int, }, - 'params':{ + 'params': { 'getcomments': True } }] diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py index b970f8ccb5..c655b75f46 100644 --- a/yt_dlp/extractor/ninenow.py +++ b/yt_dlp/extractor/ninenow.py @@ -53,7 +53,7 @@ class NineNowIE(InfoExtractor): 'upload_date': '20210421', }, 'expected_warnings': ['Ignoring subtitle tracks'], - 'params':{ + 'params': { 'skip_download': True, } }] diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 6b7379d46c..915422817a 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -765,7 +765,7 @@ def _extract_common_meta(self, stream_info): class VKPlayIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/]+)/record/(?P<id>[a-f0-9\-]+)' + _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)' _TESTS = [{ 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da', 'info_dict': { @@ -802,7 +802,7 @@ def _real_extract(self, url): class VKPlayLiveIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/]+)/?(?:[#?]|$)' + _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://vkplay.live/bayda', 'info_dict': { diff --git a/yt_dlp/extractor/wimbledon.py b/yt_dlp/extractor/wimbledon.py index ee4872e88b..0223e54f1d 100644 --- a/yt_dlp/extractor/wimbledon.py +++ b/yt_dlp/extractor/wimbledon.py @@ -6,7 +6,7 @@ class WimbledonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?wimbledon\.com/\w+/video/media/(?P<id>\d+).html' + _VALID_URL = r'https?://(?:www\.)?wimbledon\.com/\w+/video/media/(?P<id>\d+)\.html' _TESTS = [{ 'url': 'https://www.wimbledon.com/en_GB/video/media/6330247525112.html', 'info_dict': { diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 792e062fdf..8fba8c1c5a 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -12,7 +12,6 @@ from collections.abc import Iterable, Mapping from email.message import Message from http import HTTPStatus -from http.cookiejar import CookieJar from ._helper import make_ssl_context, wrap_request_errors from .exceptions import ( @@ -22,6 +21,7 @@ UnsupportedRequest, ) from ..compat.types import NoneType +from ..cookies import YoutubeDLCookieJar from ..utils import ( bug_reports_message, classproperty, @@ -194,7 +194,7 @@ def __init__( self, *, logger, # TODO(Grub4k): default logger headers: HTTPHeaderDict = None, - cookiejar: CookieJar = None, + cookiejar: YoutubeDLCookieJar = None, timeout: float | int | None = None, proxies: dict = None, source_address: str = None, @@ -208,7 +208,7 @@ def __init__( self._logger = logger self.headers = headers or {} - self.cookiejar = cookiejar if cookiejar is not None else CookieJar() + self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar() self.timeout = float(timeout or 20) self.proxies = proxies or {} self.source_address = source_address @@ -275,7 +275,7 @@ def _check_proxies(self, proxies): def _check_extensions(self, extensions): """Check extensions for unsupported extensions. 
Subclasses should extend this.""" - assert isinstance(extensions.get('cookiejar'), (CookieJar, NoneType)) + assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType)) assert isinstance(extensions.get('timeout'), (float, int, NoneType)) def _validate(self, request): @@ -302,6 +302,7 @@ def send(self, request: Request) -> Response: @abc.abstractmethod def _send(self, request: Request): """Handle a request from start to finish. Redefine in subclasses.""" + pass def close(self): pass From 546b2c28a106cf8101d481b215b676d1b091d276 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 30 Jul 2023 10:50:25 +1200 Subject: [PATCH 319/501] [ie/youtube] Fix `player_params` arg being converted to lowercase Fix bug in ba06d77a316650ff057347d224b5afa8b203ad65 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 940a4995b5..1e16631b18 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3597,7 +3597,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, if _split_innertube_client(client)[0] == 'android': yt_query['params'] = 'CgIQBg==' - pp_arg = self._configuration_arg('player_params', [None])[0] + pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] if pp_arg: yt_query['params'] = pp_arg @@ -4018,7 +4018,7 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - pp = self._configuration_arg('player_params', [None])[0] + pp = self._configuration_arg('player_params', [None], casesense=True)[0] if pp: query['pp'] = pp webpage = self._download_webpage( From f73c11803579889dc8e1c99e25dba9a22fef39d8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 30 Jul 2023 04:24:38 +0530 Subject: [PATCH 320/501] `FFmpegFixupM3u8PP` may need to run with ffmpeg Bug in 62b5c94cadaa5f596dc1a7083db9db12efe357be Closes #7725 --- yt_dlp/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index db5932c443..87bca5bbe0 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3450,10 +3450,11 @@ def ffmpeg_fixup(cndn, msg, cls): postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any(( isinstance(pp, FFmpegVideoConvertorPP) and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None) - ) for pp in self._pps['post_process']) or fd == FFmpegFD + ) for pp in self._pps['post_process']) if not postprocessed_by_ffmpeg: - ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash', + ffmpeg_fixup(fd != FFmpegFD and ext == 'm4a' + and info_dict.get('container') == 'm4a_dash', 'writing DASH m4a. 
Only some players support this container', FFmpegFixupM4aPP) ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') From 6014355c6142f68e20c8374e3787e5b5820f19e2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 29 Jul 2023 18:37:06 -0500 Subject: [PATCH 321/501] [ie/twitter] Add fallback, improve error handling (#7621) Closes #7579, Closes #7625 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 81 +++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 9d87dbc4be..34b8625c31 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,3 +1,4 @@ +import functools import json import re @@ -279,6 +280,12 @@ def input_dict(subtask_id, text): 'Submitting confirmation code', headers, data=build_login_json(input_dict( next_subtask, self._get_tfa_info('confirmation code sent to your email or phone')))) + elif next_subtask == 'ArkoseLogin': + self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies') + + elif next_subtask == 'DenyLoginSubtask': + self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies') + elif next_subtask == 'LoginSuccessSubtask': raise ExtractorError('Twitter API did not grant auth token cookie') @@ -304,8 +311,9 @@ def _call_api(self, path, video_id, query={}, graphql=False): if result.get('errors'): errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) - raise ExtractorError( - f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) + if errors and 'not authorized' in errors: + self.raise_login_required(remove_end(errors, '.')) + raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}') return result @@ -607,7 +615,7 @@ class TwitterIE(TwitterBaseIE): # has mp4 formats via mobile API 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', 'info_dict': { - 'id': '852138619213144067', + 'id': '852077943283097602', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', @@ -616,8 +624,16 @@ class TwitterIE(TwitterBaseIE): 'duration': 277.4, 'timestamp': 1492000653, 'upload_date': '20170412', + 'display_id': '852138619213144067', + 'age_limit': 0, + 'uploader_url': 'https://twitter.com/news_al3alm', + 'thumbnail': r're:^https?://.*\.jpg', + 'tags': [], + 'repost_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -675,15 +691,15 @@ class TwitterIE(TwitterBaseIE): 'id': '1087791272830607360', 'display_id': '1087791357756956680', 'ext': 'mp4', - 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', + 'title': 'X - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', - 'uploader': 'Twitter', - 'uploader_id': 'Twitter', + 'uploader': 'X', + 'uploader_id': 'X', 'duration': 61.567, 'timestamp': 1548184644, 'upload_date': '20190122', - 'uploader_url': 'https://twitter.com/Twitter', + 'uploader_url': 'https://twitter.com/X', 'comment_count': int, 'repost_count': int, 'like_count': int, @@ -991,10 +1007,10 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün The Shinobi', + 'uploader': 'Mün The Friend Of YWAP', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -1024,6 +1040,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, }, 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, + 'skip': 'Protected tweet', }, { # orig tweet w/ graphql 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', @@ -1047,6 +1064,7 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'comment_count': int, }, + 'skip': 'Protected tweet', }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1103,6 +1121,8 @@ def _graphql_to_legacy(self, data, twid): reason = result.get('reason') if reason == 'NsfwLoggedOut': self.raise_login_required('NSFW tweet requires authentication') + elif reason == 'Protected': + self.raise_login_required('You are not authorized to view this protected tweet') raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) status = result.get('legacy', {}) @@ -1187,22 +1207,38 @@ def _build_graphql_query(self, media_id): } } - def _real_extract(self, url): - twid, selected_index = self._match_valid_url(url).group('id', 'index') - if not self.is_logged_in and self._configuration_arg('legacy_api'): - status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { + def _extract_status(self, twid): + if self.is_logged_in: + return self._graphql_to_legacy( + self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + + try: + if not self._configuration_arg('legacy_api'): + return self._graphql_to_legacy( + self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) + return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, 'include_user_entities': 0, 'tweet_mode': 'extended', }), 'retweeted_status', None) - elif not self.is_logged_in: - status = self._graphql_to_legacy( - self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) - else: - status = self._graphql_to_legacy( - self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + + except ExtractorError as e: + if e.expected: + raise + self.report_warning( + f'{e.orig_msg}. 
Falling back to syndication endpoint; some metadata may be missing', twid) + + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={'id': twid}) + status['extended_entities'] = {'media': status.get('mediaDetails')} + return status + + def _real_extract(self, url): + twid, selected_index = self._match_valid_url(url).group('id', 'index') + status = self._extract_status(twid) title = description = traverse_obj( status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' @@ -1230,7 +1266,10 @@ def _real_extract(self, url): } def extract_from_video_info(media): - media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + media_id = traverse_obj(media, 'id_str', 'id', ( + 'video_info', 'variants', ..., 'url', + {functools.partial(re.search, r'_video/(\d+)/')}, 1 + ), get_all=False, expected_type=str_or_none) or twid self.write_debug(f'Extracting from video info: {media_id}') formats = [] From 6d6081dda1290a85bdab6717f239289e3aa74c8e Mon Sep 17 00:00:00 2001 From: Steve <snixon@gmail.com> Date: Mon, 31 Jul 2023 10:08:37 -0700 Subject: [PATCH 322/501] [extractor/pbs] Add extractor `PBSKidsIE` (#7602) Authored by: snixon Fixes #2440 --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/pbs.py | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9d935a7d16..2ad7e9800e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1420,7 +1420,7 @@ PatreonIE, PatreonCampaignIE ) -from .pbs import PBSIE +from .pbs import PBSIE, PBSKidsIE from .pearvideo import PearVideoIE from .peekvids import PeekVidsIE, PlayVidsIE from .peertube import ( diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 5bdf561db9..2bb2ea9f19 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -11,6 +11,7 @@ orderedSet, strip_jsonp, strip_or_none, + traverse_obj, unified_strdate, url_or_none, US_RATINGS, @@ -696,3 +697,61 @@ def extract_redirect_urls(info): 'subtitles': subtitles, 'chapters': chapters, } + + +class PBSKidsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://pbskids.org/video/molly-of-denali/3030407927', + 'md5': '1ded20a017cc6b53446238f1804ce4c7', + 'info_dict': { + 'id': '3030407927', + 'title': 'Bird in the Hand/Bye-Bye Birdie', + 'channel': 'molly-of-denali', + 'duration': 1540, + 'ext': 'mp4', + 'series': 'Molly of Denali', + 'description': 'md5:d006b2211633685d8ebc8d03b6d5611e', + 'categories': ['Episode'], + 'upload_date': '20190718', + } + }, + { + 'url': 'https://pbskids.org/video/plum-landing/2365205059', + 'md5': '92e5d189851a64ae1d0237a965be71f5', + 'info_dict': { + 'id': '2365205059', + 'title': 'Cooper\'s Favorite Place in Nature', + 'channel': 'plum-landing', + 'duration': 67, + 'ext': 'mp4', + 'series': 'Plum Landing', + 'description': 'md5:657e5fc4356a84ead1c061eb280ff05d', + 'categories': ['Episode'], + 'upload_date': '20140302', + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, 
ext='mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(meta, { + 'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}), + 'channel': ('show_slug', {str}), + 'description': ('video_obj', 'description', {str}), + 'duration': ('video_obj', 'duration', {int_or_none}), + 'series': ('video_obj', 'program_title', {str}), + 'title': ('video_obj', 'title', {str}), + 'upload_date': ('video_obj', 'air_date', {unified_strdate}), + }) + } From 30b29f37159e9226e2f2d5434c9a4096ac4efa2e Mon Sep 17 00:00:00 2001 From: ischmidt20 <ischmidt20@berkeley.edu> Date: Tue, 1 Aug 2023 03:24:04 -0400 Subject: [PATCH 323/501] [ie/fox] Support foxsports.com (#7724) Authored by: ischmidt20 --- yt_dlp/extractor/fox.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py index 8fb4ada6be..e00e977bdd 100644 --- a/yt_dlp/extractor/fox.py +++ b/yt_dlp/extractor/fox.py @@ -20,7 +20,7 @@ class FOXIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ -50,6 +50,10 @@ class FOXIE(InfoExtractor): # sports event, geo-restricted 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/', 'only_matching': True, + }, { + # fox sports replay, geo-restricted + 'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89', + 'only_matching': True, }] _GEO_BYPASS = False _HOME_PAGE_URL = 'https://www.fox.com/' From a854fbec56d5004f5147116a41d1dd050632a579 Mon Sep 17 00:00:00 2001 From: ringus1 <ringus1@users.noreply.github.com> Date: Tue, 1 Aug 2023 16:13:54 +0200 Subject: [PATCH 324/501] [ie/facebook] Add dash manifest URL (#7743) Fixes #7742 Authored by: ringus1 --- yt_dlp/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 574f8e8c95..4fd17b5743 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -481,7 +481,8 @@ def extract_dash_manifest(video, formats): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), + mpd_url=video.get('dash_manifest_url'))) def process_formats(info): # Downloads with browser's User-Agent are rate limited. 
Working around From b9de629d78ce31699f2de886071dc257830f9676 Mon Sep 17 00:00:00 2001 From: ifan-t <jacifan2000@gmail.com> Date: Tue, 1 Aug 2023 19:01:59 +0100 Subject: [PATCH 325/501] [ie/S4C] Add extractor (#7730) Authored by: ifan-t --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/s4c.py | 62 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 yt_dlp/extractor/s4c.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2ad7e9800e..63bb55ea77 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1709,6 +1709,7 @@ RuvIE, RuvSpilaIE ) +from .s4c import S4CIE from .safari import ( SafariIE, SafariApiIE, diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py new file mode 100644 index 0000000000..38a9058960 --- /dev/null +++ b/yt_dlp/extractor/s4c.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class S4CIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/programme/861362209', + 'info_dict': { + 'id': '861362209', + 'ext': 'mp4', + 'title': 'Y Swn', + 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', + 'duration': 5340 + }, + }, { + 'url': 'https://www.s4c.cymru/clic/programme/856636948', + 'info_dict': { + 'id': '856636948', + 'ext': 'mp4', + 'title': 'Am Dro', + 'duration': 2880, + 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + details = self._download_json( + f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', + video_id, fatal=False) + + filename = self._download_json( + 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ + 'programme_id': video_id, + 'signed': '0', + 'lang': 'en', + 'mode': 'od', + 'appId': 'clic', + 'streamName': '', + }, note='Downloading player config JSON')['filename'] + m3u8_url = self._download_json( + 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ + 'mode': 'od', + 'application': 'clic', + 'region': 'WW', + 'extra': 'false', + 'thirdParty': 'false', + 'filename': filename, + }, note='Downloading streaming urls JSON')['hls'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(details, ('full_prog_details', 0, { + 'title': (('programme_title', 'series_title'), {str}), + 'description': ('full_billing', {str.strip}), + 'duration': ('duration', {lambda x: int(x) * 60}), + }), get_all=False), + } From db9743894071760f994f640a4c24358f749a78c0 Mon Sep 17 00:00:00 2001 From: Franklin Lee <Frankgoji@users.noreply.github.com> Date: Tue, 1 Aug 2023 11:21:16 -0700 Subject: [PATCH 326/501] [ie/PicartoVod] Fix extractor (#7727) Closes #2926 Authored by: Frankgoji --- yt_dlp/extractor/picarto.py | 54 ++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index 36a062def3..d415ba28e1 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -1,7 +1,10 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( ExtractorError, - js_to_json, + str_or_none, + traverse_obj, ) @@ -84,7 +87,7 @@ def _real_extract(self, url): class PicartoVodIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', @@ -94,6 +97,18 @@ class PicartoVodIE(InfoExtractor): 'title': 'ArtofZod_2017.12.12.00.13.23.flv', 'thumbnail': r're:^https?://.*\.jpg' }, + 'skip': 'The VOD does not exist', + }, { + 'url': 'https://picarto.tv/ArtofZod/videos/772650', + 'md5': '00067a0889f1f6869cc512e3e79c521b', + 'info_dict': { + 'id': '772650', + 'ext': 'mp4', + 'title': 'Art of Zod - Drawing and Painting', + 'thumbnail': r're:^https?://.*\.jpg', + 'channel': 'ArtofZod', + 'age_limit': 18, + } }, { 'url': 'https://picarto.tv/videopopout/Plague', 'only_matching': True, @@ -102,21 +117,36 @@ class PicartoVodIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', video_id, query={ + 'query': f'''{{ + video(id: "{video_id}") {{ + id + title + adult + file_name + video_recording_image_url + channel {{ + name + }} + }} +}}''' + })['data']['video'] - vod_info = self._parse_json( - self._search_regex( - r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - 'vod player'), - video_id, transform_source=js_to_json) + file_name = data['file_name'] + netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc formats = self._extract_m3u8_formats( - vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'title': video_id, - 'thumbnail': vod_info.get('vodThumb'), + **traverse_obj(data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'thumbnail': 'video_recording_image_url', + 'channel': ('channel', 'name', {str}), + 'age_limit': ('adult', {lambda x: 18 if x else 0}), + }), 'formats': formats, } From db7b054a6111ca387220d0eb87bf342f9c130eb8 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 5 Aug 2023 10:17:48 +1200 Subject: [PATCH 327/501] [networking] Add request handler preference framework (#7603) Preference functions that take a request and a request handler instance can be registered to prioritize different request handlers per request. 
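For example, a preference can be registered to prioritize a specific
handler for certain requests. A minimal sketch of the new API follows;
`MyRH` is a hypothetical handler class and the `X-Prefer-My-RH` header
is purely illustrative. Higher scores sort a handler earlier:

    from yt_dlp.networking.common import register_preference

    @register_preference(MyRH)  # only consulted for MyRH instances
    def my_preference(rh, request):
        # Boost MyRH for requests carrying a marker header,
        # otherwise leave the default ordering untouched (score 0)
        return 100 if request.headers.get('X-Prefer-My-RH') else 0

The director sums the scores of all registered preferences per handler
and sorts handlers by the total, so multiple preferences can combine.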
Authored by: coletdjnz Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> --- test/test_networking.py | 29 +++++++++++++++++++++++---- yt_dlp/YoutubeDL.py | 7 ++++--- yt_dlp/networking/common.py | 40 +++++++++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 11 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 9c33b0d4c6..2622d24da6 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -1035,17 +1035,17 @@ def test_send(self): assert isinstance(director.send(Request('http://')), FakeResponse) def test_unsupported_handlers(self): - director = RequestDirector(logger=FakeLogger()) - director.add_handler(FakeRH(logger=FakeLogger())) - class SupportedRH(RequestHandler): _SUPPORTED_URL_SCHEMES = ['http'] def _send(self, request: Request): return Response(fp=io.BytesIO(b'supported'), headers={}, url=request.url) - # This handler should by default take preference over FakeRH + director = RequestDirector(logger=FakeLogger()) director.add_handler(SupportedRH(logger=FakeLogger())) + director.add_handler(FakeRH(logger=FakeLogger())) + + # First should take preference assert director.send(Request('http://')).read() == b'supported' assert director.send(Request('any://')).read() == b'' @@ -1072,6 +1072,27 @@ def _send(self, request: Request): director.add_handler(UnexpectedRH(logger=FakeLogger)) assert director.send(Request('any://')) + def test_preference(self): + director = RequestDirector(logger=FakeLogger()) + director.add_handler(FakeRH(logger=FakeLogger())) + + class SomeRH(RequestHandler): + _SUPPORTED_URL_SCHEMES = ['http'] + + def _send(self, request: Request): + return Response(fp=io.BytesIO(b'supported'), headers={}, url=request.url) + + def some_preference(rh, request): + return (0 if not isinstance(rh, SomeRH) + else 100 if 'prefer' in request.headers + else -1) + + director.add_handler(SomeRH(logger=FakeLogger())) + director.preferences.add(some_preference) + + assert director.send(Request('http://')).read() == b'' + assert director.send(Request('http://', headers={'prefer': '1'})).read() == b'supported' + # XXX: do we want to move this to test_YoutubeDL.py? 
class TestYoutubeDLNetworking: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 87bca5bbe0..666d89b461 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -34,7 +34,7 @@ from .extractor.openload import PhantomJSwrapper from .minicurses import format_text from .networking import HEADRequest, Request, RequestDirector -from .networking.common import _REQUEST_HANDLERS +from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES from .networking.exceptions import ( HTTPError, NoSupportingHandlers, @@ -683,7 +683,7 @@ def process_color_policy(stream): self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) self._load_cookies(self.params['http_headers'].get('Cookie')) # compat self.params['http_headers'].pop('Cookie', None) - self._request_director = self.build_request_director(_REQUEST_HANDLERS.values()) + self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES) if auto_init and auto_init != 'no_verbose_header': self.print_debug_header() @@ -4077,7 +4077,7 @@ def urlopen(self, req): except HTTPError as e: # TODO: Remove in a future release raise _CompatHTTPError(e) from e - def build_request_director(self, handlers): + def build_request_director(self, handlers, preferences=None): logger = _YDLLogger(self) headers = self.params['http_headers'].copy() proxies = self.proxies.copy() @@ -4106,6 +4106,7 @@ def build_request_director(self, handlers): }, }), )) + director.preferences.update(preferences or []) return director def encode(self, s): diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 8fba8c1c5a..584c7bb4db 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -31,8 +31,19 @@ ) from ..utils.networking import HTTPHeaderDict, normalize_url -if typing.TYPE_CHECKING: - RequestData = bytes | Iterable[bytes] | typing.IO | None + +def register_preference(*handlers: type[RequestHandler]): + assert all(issubclass(handler, RequestHandler) for handler in handlers) + + def outer(preference: Preference): + @functools.wraps(preference) + def inner(handler, *args, **kwargs): + if not handlers or isinstance(handler, handlers): + return preference(handler, *args, **kwargs) + return 0 + _RH_PREFERENCES.add(inner) + return inner + return outer class RequestDirector: @@ -40,12 +51,17 @@ class RequestDirector: Helper class that, when given a request, forward it to a RequestHandler that supports it. + Preference functions in the form of func(handler, request) -> int + can be registered into the `preferences` set. These are used to sort handlers + in order of preference. + @param logger: Logger instance. @param verbose: Print debug request information to stdout. 
""" def __init__(self, logger, verbose=False): self.handlers: dict[str, RequestHandler] = {} + self.preferences: set[Preference] = set() self.logger = logger # TODO(Grub4k): default logger self.verbose = verbose @@ -58,6 +74,16 @@ def add_handler(self, handler: RequestHandler): assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler' self.handlers[handler.RH_KEY] = handler + def _get_handlers(self, request: Request) -> list[RequestHandler]: + """Sorts handlers by preference, given a request""" + preferences = { + rh: sum(pref(rh, request) for pref in self.preferences) + for rh in self.handlers.values() + } + self._print_verbose('Handler preferences for this request: %s' % ', '.join( + f'{rh.RH_NAME}={pref}' for rh, pref in preferences.items())) + return sorted(self.handlers.values(), key=preferences.get, reverse=True) + def _print_verbose(self, msg): if self.verbose: self.logger.stdout(f'director: {msg}') @@ -73,8 +99,7 @@ def send(self, request: Request) -> Response: unexpected_errors = [] unsupported_errors = [] - # TODO (future): add a per-request preference system - for handler in reversed(list(self.handlers.values())): + for handler in self._get_handlers(request): self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.') try: handler.validate(request) @@ -530,3 +555,10 @@ def info(self): def getheader(self, name, default=None): deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2) return self.get_header(name, default) + + +if typing.TYPE_CHECKING: + RequestData = bytes | Iterable[bytes] | typing.IO | None + Preference = typing.Callable[[RequestHandler, Request], int] + +_RH_PREFERENCES: set[Preference] = set() From 378ae9f9fb8e8c86e6ac89c4c5b815b48ce93620 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 12 Aug 2023 16:26:08 +1200 Subject: [PATCH 328/501] [ie/youtube] Fix consent cookie (#7774) Fixes #7594 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1e16631b18..023d8fd8c1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -496,16 +496,10 @@ def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + socs = cookies.get('SOCS') + if socs and not socs.value.startswith('CAA'): # not consented + return + self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): cookies = self._get_cookies('https://www.youtube.com/') From dab87ca23650fd87184ff5286b53e6985b59f71d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 12 Aug 2023 16:30:23 -0500 Subject: [PATCH 329/501] [cookies] Containers JSON should be opened as utf-8 (#7800) Closes #7797 Authored by: bashonly --- yt_dlp/cookies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 157f5b0c2b..a71fbc28ba 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ 
-138,7 +138,7 @@ def _extract_firefox_cookies(profile, container, logger):
         containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json')
         if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK):
             raise FileNotFoundError(f'could not read containers.json in {search_root}')
-        with open(containers_path) as containers:
+        with open(containers_path, encoding='utf8') as containers:
             identities = json.load(containers).get('identities', [])
         container_id = next((context.get('userContextId') for context in identities if container in (
             context.get('name'),

From 339c339fec095ff4141b20e6aa83629117fb26df Mon Sep 17 00:00:00 2001
From: trainman261 <trainman261@users.noreply.github.com>
Date: Sun, 13 Aug 2023 01:58:55 +0200
Subject: [PATCH 330/501] [ie/CBCPlayer] Extract HLS formats and subtitles
 (#7484)

Authored by: trainman261
---
 yt_dlp/extractor/cbc.py             | 28 +++++++++++++++++++++++++++-
 yt_dlp/extractor/scrippsnetworks.py |  1 +
 yt_dlp/extractor/theplatform.py     | 22 ++++++++++++++++++++--
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 41e092422b..9413281a57 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -161,7 +161,7 @@ class CBCPlayerIE(InfoExtractor):
             'upload_date': '20160210',
             'uploader': 'CBCC-NEW',
         },
-        'skip': 'Geo-restricted to Canada',
+        'skip': 'Geo-restricted to Canada and no longer available',
     }, {
         # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
         'url': 'http://www.cbc.ca/player/play/2657631896',
@@ -174,6 +174,9 @@ class CBCPlayerIE(InfoExtractor):
             'timestamp': 1425704400,
             'upload_date': '20150307',
             'uploader': 'CBCC-NEW',
+            'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
+            'chapters': [],
+            'duration': 494.811,
         },
     }, {
         'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -186,6 +189,28 @@ class CBCPlayerIE(InfoExtractor):
             'timestamp': 1320410746,
             'upload_date': '20111104',
             'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
+            'chapters': [],
+            'duration': 186.867,
+        },
+    }, {
+        # Has subtitles
+        # These broadcasts expire after ~1 month, can find new test URL here:
+        # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
+        'url': 'http://www.cbc.ca/player/play/2249992771553',
+        'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+        'info_dict': {
+            'id': '2249992771553',
+            'ext': 'mp4',
+            'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
+            'description': 'md5:adba28011a56cfa47a080ff198dad27a',
+            'timestamp': 1690596000,
+            'duration': 2716.333,
+            'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+            'uploader': 'CBCC-NEW',
+            'chapters': 'count:5',
+            'upload_date': '20230729',
+        },
     }]
@@ -199,6 +224,7 @@ def _real_extract(self, url):
                 'force_smil_url': True
             }),
             'id': video_id,
+            '_format_sort_fields': ('res', 'proto')  # Prioritize direct http formats over HLS
         }


diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py
index c3cee6e4aa..adfd7e5f29 100644
--- a/yt_dlp/extractor/scrippsnetworks.py
+++ b/yt_dlp/extractor/scrippsnetworks.py
@@ -115,6 +115,7 @@ class ScrippsNetworksIE(InfoExtractor):
             'uploader': 'SCNI-SCND',
         },
         'add_ie': ['ThePlatform'],
+        'expected_warnings': ['No HLS formats found'],
     }, {
         'url': 
'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', 'only_matching': True, diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 8307b912dd..99caeb5f99 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -19,7 +19,11 @@ xpath_with_ns, mimetype2ext, find_xpath_attr, + traverse_obj, + update_url, + urlhandle_detect_ext, ) +from ..networking import HEADRequest default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) @@ -162,7 +166,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -171,7 +176,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', - } + }, + 'skip': '404 Not Found', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -189,6 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, + 'skip': '404 Not Found', }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 @@ -295,6 +302,17 @@ def _real_extract(self, url): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + # With some sites, manifest URL must be forced to extract HLS formats + if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')): + m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None) + urlh = self._request_webpage( + HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False) + if urlh and urlhandle_detect_ext(urlh) == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(m3u8_fmts) + self._merge_subtitles(m3u8_subs, target=subtitles) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ From 876b70c8edf4c0147f180bd981fbc4d625cbfb9c Mon Sep 17 00:00:00 2001 From: garret <garret1317@yandex.com> Date: Mon, 14 Aug 2023 19:29:04 +0100 Subject: [PATCH 331/501] [ie/tbsjp] Add episode, program, playlist extractors (#7765) Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 5 ++ yt_dlp/extractor/tbsjp.py | 152 ++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 yt_dlp/extractor/tbsjp.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 63bb55ea77..d4d3b6074c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1901,6 +1901,11 @@ from .tagesschau import TagesschauIE from .tass import TassIE from .tbs import TBSIE +from .tbsjp import ( + TBSJPEpisodeIE, + TBSJPProgramIE, + TBSJPPlaylistIE, +) from .tdslifeway import TDSLifewayIE from .teachable import ( TeachableIE, diff --git a/yt_dlp/extractor/tbsjp.py b/yt_dlp/extractor/tbsjp.py new file mode 100644 index 0000000000..77ddeca32c --- /dev/null +++ b/yt_dlp/extractor/tbsjp.py @@ -0,0 +1,152 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( 
+ ExtractorError, + clean_html, + get_element_text_and_html_by_tag, + int_or_none, + str_or_none, + traverse_obj, + try_call, + unified_timestamp, + urljoin, +) + + +class TBSJPEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)' + _GEO_BYPASS = False + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010', + 'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually', + 'info_dict': { + 'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始', + 'id': '23613_2044134_1000049010', + 'ext': 'mp4', + 'upload_date': '20230728', + 'duration': 3517, + 'release_timestamp': 1691118230, + 'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始', + 'release_date': '20230804', + 'categories': 'count:11', + 'episode_number': 3, + 'timestamp': 1690522538, + 'description': 'md5:2b796341af1ef772034133174ba4a895', + 'series': 'VIVANT', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False) + episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value')) + + tf_path = self._search_regex( + r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config') + tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config') + video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url') + api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key') + + try: + source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id, + headers={'X-Streaks-Api-Key': api_key}, + note='Downloading stream metadata') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.raise_geo_restricted(countries=['JP']) + raise + + formats, subtitles = [], {} + for src in traverse_obj(source_meta, ('sources', ..., 'src')): + fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])), + 'id': video_id, + **traverse_obj(episode, { + 'categories': ('keywords', {list}), + 'id': ('content_id', {str}), + 'description': ('description', 0, 'value'), + 'timestamp': ('created_at', {unified_timestamp}), + 'release_timestamp': ('pub_date', {unified_timestamp}), + 'duration': ('tv_episode_info', 'duration', {int_or_none}), + 'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}), + 'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'), + 'series': ('custom_data', 'program_name'), + }, get_all=False), + 'formats': formats, + 'subtitles': subtitles, + } + + +class TBSJPProgramIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/program/23601', + 'playlist_mincount': 4, + 'info_dict': { + 'id': '23601', + 'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'], + 'description': '幼少期の夢は大人になって、どう成長したのだろうか?\nそしてその夢は今後、どのように広がっていくのか?\nいま話題の会社で働く人の「夢の成長」を描く', + 'series': 'ミライカプセル -I have a dream-', + 'title': 'ミライカプセル -I have a dream-' + } + }] + + def _real_extract(self, url): + programme_id = self._match_id(url) + webpage = self._download_webpage(url, programme_id) + meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', 
programme_id) + + programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value')) + + return { + '_type': 'playlist', + 'entries': [self.url_result(f'https://cu.tbs.co.jp/episode/{video_id}', TBSJPEpisodeIE, video_id) + for video_id in traverse_obj(programme, ('custom_data', 'seriesList', 'episodeCode', ...))], + 'id': programme_id, + **traverse_obj(programme, { + 'categories': ('keywords', ...), + 'id': ('tv_episode_info', 'show_content_id', {str_or_none}), + 'description': ('custom_data', 'program_description'), + 'series': ('custom_data', 'program_name'), + 'title': ('custom_data', 'program_name'), + }), + } + + +class TBSJPPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e', + 'playlist_mincount': 4, + 'info_dict': { + 'title': 'まもなく配信終了', + 'id': '184f9970e7ba48e4915f1b252c55015e', + } + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page = self._download_webpage(url, playlist_id) + meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id) + playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id)) + + def entries(): + for entry in traverse_obj(playlist, ('catalogs', 'value', lambda _, v: v['content_id'])): + # TODO: it's likely possible to get all metadata from the playlist page json instead + content_id = entry['content_id'] + content_type = entry.get('content_type') + if content_type == 'tv_show': + yield self.url_result( + f'https://cu.tbs.co.jp/program/{content_id}', TBSJPProgramIE, content_id) + elif content_type == 'tv_episode': + yield self.url_result( + f'https://cu.tbs.co.jp/episode/{content_id}', TBSJPEpisodeIE, content_id) + else: + self.report_warning(f'Skipping "{content_id}" with unsupported content_type "{content_type}"') + + return self.playlist_result(entries(), playlist_id, traverse_obj(playlist, ('display_name', 'value'))) From a0de8bb8601146b8f87bf7cd562eef8bfb4690be Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 20 Aug 2023 11:10:15 -0500 Subject: [PATCH 332/501] [ie/zee5] Update access token endpoint (#7914) Closes #7911 Authored by: bashonly --- yt_dlp/extractor/zee5.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index b4734cc8f1..ca79cf0a71 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -133,8 +133,8 @@ def _perform_login(self, username, password): def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') access_token_request = self._download_json( - 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app', - video_id, note='Downloading access token') + 'https://launchapi.zee5.com/launch?platform_name=web_app', + video_id, note='Downloading access token')['platform_token'] data = { 'x-access-token': access_token_request['token'] } @@ -240,8 +240,8 @@ class Zee5SeriesIE(InfoExtractor): def _entries(self, show_id): access_token_request = self._download_json( - 'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app', - show_id, note='Downloading access token') + 'https://launchapi.zee5.com/launch?platform_name=web_app', + show_id, note='Downloading access token')['platform_token'] headers = { 'X-Access-Token': access_token_request['token'], 'Referer': 'https://www.zee5.com/', From 
ed711897814f3ee0b1822e4205e74133467e8f1c Mon Sep 17 00:00:00 2001
From: trainman261 <trainman261@users.noreply.github.com>
Date: Sun, 20 Aug 2023 18:35:57 +0200
Subject: [PATCH 333/501] [ie/CBCPlayerPlaylist] Add extractor (#7870)

Authored by: trainman261
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/cbc.py         | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index d4d3b6074c..194ad8356f 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -303,6 +303,7 @@
 from .cbc import (
     CBCIE,
     CBCPlayerIE,
+    CBCPlayerPlaylistIE,
     CBCGemIE,
     CBCGemPlaylistIE,
     CBCGemLiveIE,
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 9413281a57..b3c5471f7b 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -2,6 +2,7 @@
 import json
 import base64
 import time
+import urllib.parse

 from .common import InfoExtractor
 from ..compat import (
@@ -228,6 +229,38 @@ def _real_extract(self, url):
     }


+class CBCPlayerPlaylistIE(InfoExtractor):
+    IE_NAME = 'cbc.ca:player:playlist'
+    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:player/)(?!play/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 'news/tv shows/the national/latest broadcast',
+        }
+    }, {
+        'url': 'https://www.cbc.ca/player/news/Canada/North',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 'news/canada/north',
+        }
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = urllib.parse.unquote(self._match_id(url)).lower()
+        webpage = self._download_webpage(url, playlist_id)
+        json_content = self._search_json(
+            r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', playlist_id)
+
+        def entries():
+            for video_id in traverse_obj(json_content, (
+                'video', 'clipsByCategory', lambda k, _: k.lower() == playlist_id, 'items', ..., 'id'
+            )):
+                yield self.url_result(f'https://www.cbc.ca/player/play/{video_id}', CBCPlayerIE)
+
+        return self.playlist_result(entries(), playlist_id)
+
+
 class CBCGemIE(InfoExtractor):
     IE_NAME = 'gem.cbc.ca'
     _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
From 7cccab79e7d00ed965b48b8cefce1da8a0513409 Mon Sep 17 00:00:00 2001
From: Davin Kevin <davin.kevin@gmail.com>
Date: Sun, 20 Aug 2023 19:25:49 +0200
Subject: [PATCH 334/501] [ie/wat.tv] Fix extraction (#7898)

Closes #7303

Authored by: davinkevin
---
 yt_dlp/extractor/tf1.py | 19 +++++++++++++++++++
 yt_dlp/extractor/wat.py | 14 +++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/tf1.py b/yt_dlp/extractor/tf1.py
index 4cf0322b35..aba4927ae8 100644
--- a/yt_dlp/extractor/tf1.py
+++ b/yt_dlp/extractor/tf1.py
@@ -27,6 +27,25 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
+    }, {
+        'url': 'https://www.tf1.fr/tmc/burger-quiz/videos/burger-quiz-du-19-aout-2023-s03-episode-21-85585666.html',
+        'info_dict': {
+            'id': '14010600',
+            'ext': 'mp4',
+            'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï',
+            'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg',
+            'description': 'Manu Payet recevra Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï.',
+            'upload_date': '20230819',
+            'timestamp': 1692469471,
+            'season_number': 3,
+            'series': 'Burger Quiz',
+            'episode_number': 21,
+            'season': 
'Season 3',
+            'tags': 'count:13',
+            'episode': 'Episode 21',
+            'duration': 2312
+        },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
         'only_matching': True,
diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py
index 7c62d2866a..9ea3fddd63 100644
--- a/yt_dlp/extractor/wat.py
+++ b/yt_dlp/extractor/wat.py
@@ -41,6 +41,18 @@ class WatIE(InfoExtractor):
         'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
         'skip': 'This content is no longer available',
     },
+    {
+        'url': 'wat:14010600',
+        'info_dict': {
+            'id': '14010600',
+            'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï',
+            'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg',
+            'upload_date': '20230819',
+            'duration': 2312,
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }
 ]
 _GEO_BYPASS = False
@@ -54,7 +66,7 @@ def _real_extract(self, url):
     # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
     video_data = self._download_json(
         'https://mediainfo.tf1.fr/mediainfocombo/' + video_id,
-        video_id, query={'context': 'MYTF1', 'pver': '4020003'})
+        video_id, query={'pver': '5010000'})
     video_info = video_data['media']
     error_desc = video_info.get('error_desc')
From fcd6a76adc49d5cd8783985c7ce35384b72e545f Mon Sep 17 00:00:00 2001
From: coletdjnz <coletdjnz@protonmail.com>
Date: Fri, 25 Aug 2023 07:10:44 +0000
Subject: [PATCH 335/501] [tests] Add tests for socks proxies (#7908)

Authored by: coletdjnz
---
 test/conftest.py        |  21 ++
 test/test_networking.py |  16 --
 test/test_socks.py      | 529 +++++++++++++++++++++++++++++++++------
 3 files changed, 464 insertions(+), 102 deletions(-)
 create mode 100644 test/conftest.py

diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000000..15549d30b9
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,21 @@
+import functools
+import inspect
+
+import pytest
+
+from yt_dlp.networking import RequestHandler
+from yt_dlp.networking.common import _REQUEST_HANDLERS
+from yt_dlp.utils._utils import _YDLLogger as FakeLogger
+
+
+@pytest.fixture
+def handler(request):
+    RH_KEY = request.param
+    if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler):
+        handler = RH_KEY
+    elif RH_KEY in _REQUEST_HANDLERS:
+        handler = _REQUEST_HANDLERS[RH_KEY]
+    else:
+        pytest.skip(f'{RH_KEY} request handler is not available')
+
+    return functools.partial(handler, logger=FakeLogger)
diff --git a/test/test_networking.py b/test/test_networking.py
index 2622d24da6..5308c8d6fa 100644
--- a/test/test_networking.py
+++ b/test/test_networking.py
@@ -8,12 +8,10 @@
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-import functools
 import gzip
 import http.client
 import http.cookiejar
 import http.server
-import inspect
 import io
 import pathlib
 import random
@@ -40,7 +38,6 @@
     Response,
 )
 from yt_dlp.networking._urllib import UrllibRH
-from yt_dlp.networking.common import _REQUEST_HANDLERS
 from yt_dlp.networking.exceptions import (
     CertificateVerifyError,
     HTTPError,
@@ -307,19 +304,6 @@ def setup_class(cls):
         cls.https_server_thread.start()


-@pytest.fixture
-def handler(request):
-    RH_KEY = request.param
-    if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler):
-        handler = RH_KEY
-    elif RH_KEY in _REQUEST_HANDLERS:
-        handler = _REQUEST_HANDLERS[RH_KEY]
-    else:
-        pytest.skip(f'{RH_KEY} request handler is not available')
-
-    return functools.partial(handler, logger=FakeLogger)
-
-
 class 
TestHTTPRequestHandler(TestRequestHandlerBase): @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) def test_verify_cert(self, handler): diff --git a/test/test_socks.py b/test/test_socks.py index 6651290d27..95ffce275b 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -1,113 +1,470 @@ #!/usr/bin/env python3 - # Allow direct execution import os import sys +import threading import unittest +import pytest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - +import abc +import contextlib +import enum +import functools +import http.server +import json import random -import subprocess -import urllib.request +import socket +import struct +import time +from socketserver import ( + BaseRequestHandler, + StreamRequestHandler, + ThreadingTCPServer, +) -from test.helper import FakeYDL, get_params, is_download_test +from test.helper import http_server_port +from yt_dlp.networking import Request +from yt_dlp.networking.exceptions import ProxyError, TransportError +from yt_dlp.socks import ( + SOCKS4_REPLY_VERSION, + SOCKS4_VERSION, + SOCKS5_USER_AUTH_SUCCESS, + SOCKS5_USER_AUTH_VERSION, + SOCKS5_VERSION, + Socks5AddressType, + Socks5Auth, +) + +SOCKS5_USER_AUTH_FAILURE = 0x1 -@is_download_test -class TestMultipleSocks(unittest.TestCase): - @staticmethod - def _check_params(attrs): - params = get_params() - for attr in attrs: - if attr not in params: - print('Missing %s. Skipping.' % attr) +class Socks4CD(enum.IntEnum): + REQUEST_GRANTED = 90 + REQUEST_REJECTED_OR_FAILED = 91 + REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD = 92 + REQUEST_REJECTED_DIFFERENT_USERID = 93 + + +class Socks5Reply(enum.IntEnum): + SUCCEEDED = 0x0 + GENERAL_FAILURE = 0x1 + CONNECTION_NOT_ALLOWED = 0x2 + NETWORK_UNREACHABLE = 0x3 + HOST_UNREACHABLE = 0x4 + CONNECTION_REFUSED = 0x5 + TTL_EXPIRED = 0x6 + COMMAND_NOT_SUPPORTED = 0x7 + ADDRESS_TYPE_NOT_SUPPORTED = 0x8 + + +class SocksTestRequestHandler(BaseRequestHandler): + + def __init__(self, *args, socks_info=None, **kwargs): + self.socks_info = socks_info + super().__init__(*args, **kwargs) + + +class SocksProxyHandler(BaseRequestHandler): + def __init__(self, request_handler_class, socks_server_kwargs, *args, **kwargs): + self.socks_kwargs = socks_server_kwargs or {} + self.request_handler_class = request_handler_class + super().__init__(*args, **kwargs) + + +class Socks5ProxyHandler(StreamRequestHandler, SocksProxyHandler): + + # SOCKS5 protocol https://tools.ietf.org/html/rfc1928 + # SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929 + + def handle(self): + sleep = self.socks_kwargs.get('sleep') + if sleep: + time.sleep(sleep) + version, nmethods = self.connection.recv(2) + assert version == SOCKS5_VERSION + methods = list(self.connection.recv(nmethods)) + + auth = self.socks_kwargs.get('auth') + + if auth is not None and Socks5Auth.AUTH_USER_PASS not in methods: + self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NO_ACCEPTABLE)) + self.server.close_request(self.request) + return + + elif Socks5Auth.AUTH_USER_PASS in methods: + self.connection.sendall(struct.pack("!BB", SOCKS5_VERSION, Socks5Auth.AUTH_USER_PASS)) + + _, user_len = struct.unpack('!BB', self.connection.recv(2)) + username = self.connection.recv(user_len).decode() + pass_len = ord(self.connection.recv(1)) + password = self.connection.recv(pass_len).decode() + + if username == auth[0] and password == auth[1]: + self.connection.sendall(struct.pack('!BB', SOCKS5_USER_AUTH_VERSION, SOCKS5_USER_AUTH_SUCCESS)) + else: + 
self.connection.sendall(struct.pack('!BB', SOCKS5_USER_AUTH_VERSION, SOCKS5_USER_AUTH_FAILURE)) + self.server.close_request(self.request) return - return params - def test_proxy_http(self): - params = self._check_params(['primary_proxy', 'primary_server_ip']) - if params is None: - return - ydl = FakeYDL({ - 'proxy': params['primary_proxy'] - }) - self.assertEqual( - ydl.urlopen('http://yt-dl.org/ip').read().decode(), - params['primary_server_ip']) - - def test_proxy_https(self): - params = self._check_params(['primary_proxy', 'primary_server_ip']) - if params is None: - return - ydl = FakeYDL({ - 'proxy': params['primary_proxy'] - }) - self.assertEqual( - ydl.urlopen('https://yt-dl.org/ip').read().decode(), - params['primary_server_ip']) - - def test_secondary_proxy_http(self): - params = self._check_params(['secondary_proxy', 'secondary_server_ip']) - if params is None: - return - ydl = FakeYDL() - req = urllib.request.Request('http://yt-dl.org/ip') - req.add_header('Ytdl-request-proxy', params['secondary_proxy']) - self.assertEqual( - ydl.urlopen(req).read().decode(), - params['secondary_server_ip']) - - def test_secondary_proxy_https(self): - params = self._check_params(['secondary_proxy', 'secondary_server_ip']) - if params is None: - return - ydl = FakeYDL() - req = urllib.request.Request('https://yt-dl.org/ip') - req.add_header('Ytdl-request-proxy', params['secondary_proxy']) - self.assertEqual( - ydl.urlopen(req).read().decode(), - params['secondary_server_ip']) - - -@is_download_test -class TestSocks(unittest.TestCase): - _SKIP_SOCKS_TEST = True - - def setUp(self): - if self._SKIP_SOCKS_TEST: + elif Socks5Auth.AUTH_NONE in methods: + self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NONE)) + else: + self.connection.sendall(struct.pack('!BB', SOCKS5_VERSION, Socks5Auth.AUTH_NO_ACCEPTABLE)) + self.server.close_request(self.request) return - self.port = random.randint(20000, 30000) - self.server_process = subprocess.Popen([ - 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + version, command, _, address_type = struct.unpack('!BBBB', self.connection.recv(4)) + socks_info = { + 'version': version, + 'auth_methods': methods, + 'command': command, + 'client_address': self.client_address, + 'ipv4_address': None, + 'domain_address': None, + 'ipv6_address': None, + } + if address_type == Socks5AddressType.ATYP_IPV4: + socks_info['ipv4_address'] = socket.inet_ntoa(self.connection.recv(4)) + elif address_type == Socks5AddressType.ATYP_DOMAINNAME: + socks_info['domain_address'] = self.connection.recv(ord(self.connection.recv(1))).decode() + elif address_type == Socks5AddressType.ATYP_IPV6: + socks_info['ipv6_address'] = socket.inet_ntop(socket.AF_INET6, self.connection.recv(16)) + else: + self.server.close_request(self.request) - def tearDown(self): - if self._SKIP_SOCKS_TEST: + socks_info['port'] = struct.unpack('!H', self.connection.recv(2))[0] + + # dummy response, the returned IP is just a placeholder + self.connection.sendall(struct.pack( + '!BBBBIH', SOCKS5_VERSION, self.socks_kwargs.get('reply', Socks5Reply.SUCCEEDED), 0x0, 0x1, 0x7f000001, 40000)) + + self.request_handler_class(self.request, self.client_address, self.server, socks_info=socks_info) + + +class Socks4ProxyHandler(StreamRequestHandler, SocksProxyHandler): + + # SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol + # SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol + + def _read_until_null(self): + return 
b''.join(iter(functools.partial(self.connection.recv, 1), b'\x00')) + + def handle(self): + sleep = self.socks_kwargs.get('sleep') + if sleep: + time.sleep(sleep) + socks_info = { + 'version': SOCKS4_VERSION, + 'command': None, + 'client_address': self.client_address, + 'ipv4_address': None, + 'port': None, + 'domain_address': None, + } + version, command, dest_port, dest_ip = struct.unpack('!BBHI', self.connection.recv(8)) + socks_info['port'] = dest_port + socks_info['command'] = command + if version != SOCKS4_VERSION: + self.server.close_request(self.request) + return + use_remote_dns = False + if 0x0 < dest_ip <= 0xFF: + use_remote_dns = True + else: + socks_info['ipv4_address'] = socket.inet_ntoa(struct.pack("!I", dest_ip)) + + user_id = self._read_until_null().decode() + if user_id != (self.socks_kwargs.get('user_id') or ''): + self.connection.sendall(struct.pack( + '!BBHI', SOCKS4_REPLY_VERSION, Socks4CD.REQUEST_REJECTED_DIFFERENT_USERID, 0x00, 0x00000000)) + self.server.close_request(self.request) return - self.server_process.terminate() - self.server_process.communicate() + if use_remote_dns: + socks_info['domain_address'] = self._read_until_null().decode() - def _get_ip(self, protocol): - if self._SKIP_SOCKS_TEST: - return '127.0.0.1' + # dummy response, the returned IP is just a placeholder + self.connection.sendall( + struct.pack( + '!BBHI', SOCKS4_REPLY_VERSION, + self.socks_kwargs.get('cd_reply', Socks4CD.REQUEST_GRANTED), 40000, 0x7f000001)) - ydl = FakeYDL({ - 'proxy': '%s://127.0.0.1:%d' % (protocol, self.port), - }) - return ydl.urlopen('http://yt-dl.org/ip').read().decode() + self.request_handler_class(self.request, self.client_address, self.server, socks_info=socks_info) - def test_socks4(self): - self.assertTrue(isinstance(self._get_ip('socks4'), str)) - def test_socks4a(self): - self.assertTrue(isinstance(self._get_ip('socks4a'), str)) +class IPv6ThreadingTCPServer(ThreadingTCPServer): + address_family = socket.AF_INET6 - def test_socks5(self): - self.assertTrue(isinstance(self._get_ip('socks5'), str)) + +class SocksHTTPTestRequestHandler(http.server.BaseHTTPRequestHandler, SocksTestRequestHandler): + def do_GET(self): + if self.path == '/socks_info': + payload = json.dumps(self.socks_info.copy()) + self.send_response(200) + self.send_header('Content-Type', 'application/json; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload.encode()) + + +@contextlib.contextmanager +def socks_server(socks_server_class, request_handler, bind_ip=None, **socks_server_kwargs): + server = server_thread = None + try: + bind_address = bind_ip or '127.0.0.1' + server_type = ThreadingTCPServer if '.' in bind_address else IPv6ThreadingTCPServer + server = server_type( + (bind_address, 0), functools.partial(socks_server_class, request_handler, socks_server_kwargs)) + server_port = http_server_port(server) + server_thread = threading.Thread(target=server.serve_forever) + server_thread.daemon = True + server_thread.start() + if '.' 
not in bind_address: + yield f'[{bind_address}]:{server_port}' + else: + yield f'{bind_address}:{server_port}' + finally: + server.shutdown() + server.server_close() + server_thread.join(2.0) + + +class SocksProxyTestContext(abc.ABC): + REQUEST_HANDLER_CLASS = None + + def socks_server(self, server_class, *args, **kwargs): + return socks_server(server_class, self.REQUEST_HANDLER_CLASS, *args, **kwargs) + + @abc.abstractmethod + def socks_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs) -> dict: + """return a dict of socks_info""" + + +class HTTPSocksTestProxyContext(SocksProxyTestContext): + REQUEST_HANDLER_CLASS = SocksHTTPTestRequestHandler + + def socks_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs): + request = Request(f'http://{target_domain or "127.0.0.1"}:{target_port or "40000"}/socks_info', **req_kwargs) + handler.validate(request) + return json.loads(handler.send(request).read().decode()) + + +CTX_MAP = { + 'http': HTTPSocksTestProxyContext, +} + + +@pytest.fixture(scope='module') +def ctx(request): + return CTX_MAP[request.param]() + + +class TestSocks4Proxy: + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks4_no_auth(self, handler, ctx): + with handler() as rh: + with ctx.socks_server(Socks4ProxyHandler) as server_address: + response = ctx.socks_info_request( + rh, proxies={'all': f'socks4://{server_address}'}) + assert response['version'] == 4 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks4_auth(self, handler, ctx): + with handler() as rh: + with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address: + with pytest.raises(ProxyError): + ctx.socks_info_request(rh, proxies={'all': f'socks4://{server_address}'}) + response = ctx.socks_info_request( + rh, proxies={'all': f'socks4://user:@{server_address}'}) + assert response['version'] == 4 + + @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='socks4a implementation currently broken when destination is not a domain name')) + ], indirect=True) + def test_socks4a_ipv4_target(self, handler, ctx): + with ctx.socks_server(Socks4ProxyHandler) as server_address: + with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='127.0.0.1') + assert response['version'] == 4 + assert response['ipv4_address'] == '127.0.0.1' + assert response['domain_address'] is None + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks4a_domain_target(self, handler, ctx): + with ctx.socks_server(Socks4ProxyHandler) as server_address: + with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='localhost') + assert response['version'] == 4 + assert response['ipv4_address'] is None + assert response['domain_address'] == 'localhost' + + @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='source_address is not yet supported for socks4 proxies')) + ], indirect=True) + def test_ipv4_client_source_address(self, handler, ctx): + with ctx.socks_server(Socks4ProxyHandler) as server_address: + source_address = f'127.0.0.{random.randint(5, 255)}' + with handler(proxies={'all': f'socks4://{server_address}'}, + source_address=source_address) as rh: + response = ctx.socks_info_request(rh) + assert 
response['client_address'][0] == source_address + assert response['version'] == 4 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('reply_code', [ + Socks4CD.REQUEST_REJECTED_OR_FAILED, + Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD, + Socks4CD.REQUEST_REJECTED_DIFFERENT_USERID, + ]) + def test_socks4_errors(self, handler, ctx, reply_code): + with ctx.socks_server(Socks4ProxyHandler, cd_reply=reply_code) as server_address: + with handler(proxies={'all': f'socks4://{server_address}'}) as rh: + with pytest.raises(ProxyError): + ctx.socks_info_request(rh) + + @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='IPv6 socks4 proxies are not yet supported')) + ], indirect=True) + def test_ipv6_socks4_proxy(self, handler, ctx): + with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: + with handler(proxies={'all': f'socks4://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='127.0.0.1') + assert response['client_address'][0] == '::1' + assert response['ipv4_address'] == '127.0.0.1' + assert response['version'] == 4 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_timeout(self, handler, ctx): + with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: + with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh: + with pytest.raises(TransportError): + ctx.socks_info_request(rh) + + +class TestSocks5Proxy: + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5_no_auth(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + response = ctx.socks_info_request(rh) + assert response['auth_methods'] == [0x0] + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5_user_pass(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address: + with handler() as rh: + with pytest.raises(ProxyError): + ctx.socks_info_request(rh, proxies={'all': f'socks5://{server_address}'}) + + response = ctx.socks_info_request( + rh, proxies={'all': f'socks5://test:testpass@{server_address}'}) + + assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS] + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5_ipv4_target(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='127.0.0.1') + assert response['ipv4_address'] == '127.0.0.1' + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5_domain_target(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='localhost') + assert response['ipv4_address'] == '127.0.0.1' + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5h_domain_target(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: 
+ with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='localhost') + assert response['ipv4_address'] is None + assert response['domain_address'] == 'localhost' + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_socks5h_ip_target(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='127.0.0.1') + assert response['ipv4_address'] == '127.0.0.1' + assert response['domain_address'] is None + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='IPv6 destination addresses are not yet supported')) + ], indirect=True) + def test_socks5_ipv6_destination(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='[::1]') + assert response['ipv6_address'] == '::1' + assert response['port'] == 80 + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='IPv6 socks5 proxies are not yet supported')) + ], indirect=True) + def test_ipv6_socks5_proxy(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + response = ctx.socks_info_request(rh, target_domain='127.0.0.1') + assert response['client_address'][0] == '::1' + assert response['ipv4_address'] == '127.0.0.1' + assert response['version'] == 5 + + # XXX: is there any feasible way of testing IPv6 source addresses? + # Same would go for non-proxy source_address test... 
+ @pytest.mark.parametrize('handler,ctx', [ + pytest.param('Urllib', 'http', marks=pytest.mark.xfail( + reason='source_address is not yet supported for socks5 proxies')) + ], indirect=True) + def test_ipv4_client_source_address(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler) as server_address: + source_address = f'127.0.0.{random.randint(5, 255)}' + with handler(proxies={'all': f'socks5://{server_address}'}, source_address=source_address) as rh: + response = ctx.socks_info_request(rh) + assert response['client_address'][0] == source_address + assert response['version'] == 5 + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + @pytest.mark.parametrize('reply_code', [ + Socks5Reply.GENERAL_FAILURE, + Socks5Reply.CONNECTION_NOT_ALLOWED, + Socks5Reply.NETWORK_UNREACHABLE, + Socks5Reply.HOST_UNREACHABLE, + Socks5Reply.CONNECTION_REFUSED, + Socks5Reply.TTL_EXPIRED, + Socks5Reply.COMMAND_NOT_SUPPORTED, + Socks5Reply.ADDRESS_TYPE_NOT_SUPPORTED, + ]) + def test_socks5_errors(self, handler, ctx, reply_code): + with ctx.socks_server(Socks5ProxyHandler, reply=reply_code) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}) as rh: + with pytest.raises(ProxyError): + ctx.socks_info_request(rh) + + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) + def test_timeout(self, handler, ctx): + with ctx.socks_server(Socks5ProxyHandler, sleep=2) as server_address: + with handler(proxies={'all': f'socks5://{server_address}'}, timeout=1) as rh: + with pytest.raises(TransportError): + ctx.socks_info_request(rh) if __name__ == '__main__': From 1be0a96a4d14f629097509fcc89d15f69a8243c7 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sat, 26 Aug 2023 22:29:56 +0200 Subject: [PATCH 336/501] [docs] Update collaborators Authored by: Grub4K --- CONTRIBUTORS | 2 +- Collaborators.md | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6ccd08931d..6b9b9f4701 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -2,7 +2,6 @@ pukkandan (owner) shirt-dev (collaborator) coletdjnz/colethedj (collaborator) Ashish0804 (collaborator) -nao20010128nao/Lesmiscore (collaborator) bashonly (collaborator) Grub4K (collaborator) h-h-h-h @@ -467,3 +466,4 @@ nnoboa rdamas RfadnjdExt urectanc +nao20010128nao/Lesmiscore diff --git a/Collaborators.md b/Collaborators.md index a0976dd8c5..70ab616f11 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -44,16 +44,6 @@ ## [Ashish0804](https://github.com/Ashish0804) <sub><sup>[Inactive]</sup></sub> * Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc -## [Lesmiscore](https://github.com/Lesmiscore) - -**Bitcoin**: bc1qfd02r007cutfdjwjmyy9w23rjvtls6ncve7r3s -**Monacoin**: mona1q3tf7dzvshrhfe3md379xtvt2n22duhglv5dskr - -* Download live from start to end for YouTube -* Added support for new websites AbemaTV, mildom, PixivSketch, skeb, radiko, voicy, mirrativ, openrec, whowatch, damtomo, 17.live, mixch etc -* Improved/fixed support for fc2, YahooJapanNews, tver, iwara etc - - ## [bashonly](https://github.com/bashonly) * `--update-to`, automated release, nightly builds From 59e92b1f1833440bb2190f847eb735cf0f90bc85 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 27 Aug 2023 00:13:30 +0200 Subject: [PATCH 337/501] [rh/urllib] Simplify gzip decoding (#7611) Authored by: Grub4K --- yt_dlp/networking/_urllib.py | 18 +++--------------- 1 
file changed, 3 insertions(+), 15 deletions(-) diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 0c4794954b..5a804d99b4 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -1,7 +1,6 @@ from __future__ import annotations import functools -import gzip import http.client import io import socket @@ -155,20 +154,9 @@ def brotli(data): @staticmethod def gz(data): - gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') - try: - return gz.read() - except OSError as original_oserror: - # There may be junk add the end of the file - # See http://stackoverflow.com/q/4928560/35070 for details - for i in range(1, 1024): - try: - gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb') - return gz.read() - except OSError: - continue - else: - raise original_oserror + # There may be junk added the end of the file + # We ignore it by only ever decoding a single gzip payload + return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16) def http_request(self, req): # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not From d7aee8e310b2c4f21d50aac0b420e1b3abde21a4 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Fri, 25 Aug 2023 08:44:05 -0500 Subject: [PATCH 338/501] [ie/Mzaalo] Improve `_VALID_URL` Authored by: bashonly --- yt_dlp/extractor/mzaalo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mzaalo.py b/yt_dlp/extractor/mzaalo.py index c6f420ceaa..1996368cc1 100644 --- a/yt_dlp/extractor/mzaalo.py +++ b/yt_dlp/extractor/mzaalo.py @@ -8,7 +8,7 @@ class MzaaloIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mzaalo\.com/play/(?P<type>movie|original|clip)/(?P<id>[a-fA-F0-9-]+)/[\w-]+' + _VALID_URL = r'(?i)https?://(?:www\.)?mzaalo\.com/(?:play|watch)/(?P<type>movie|original|clip)/(?P<id>[a-f0-9-]+)/[\w-]+' _TESTS = [{ # Movies 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun', @@ -55,6 +55,9 @@ class MzaaloIE(InfoExtractor): 'language': 'hin', }, 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://mzaalo.com/watch/MOVIE/389c892d-0b65-4019-bf73-d4edcb1c014f/Chalo-Dilli', + 'only_matching': True, }] def _real_extract(self, url): From 56b3dc03354b75be995759d8441d2754c0442b9a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 27 Aug 2023 18:33:25 -0500 Subject: [PATCH 339/501] [ie/StagePlus] Fix m3u8 extraction (#7929) Closes #7928 Authored by: bashonly --- yt_dlp/extractor/stageplus.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/yt_dlp/extractor/stageplus.py b/yt_dlp/extractor/stageplus.py index adb4ebbc2d..4bed4d646a 100644 --- a/yt_dlp/extractor/stageplus.py +++ b/yt_dlp/extractor/stageplus.py @@ -484,18 +484,15 @@ def _real_extract(self, url): 'url': 'url', })) or None - m3u8_headers = {'jwt': self._TOKEN} - entries = [] for idx, video in enumerate(traverse_obj(data, ( 'performanceWorks', lambda _, v: v['id'] and url_or_none(v['stream']['url']))), 1): formats, subtitles = self._extract_m3u8_formats_and_subtitles( - video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', headers=m3u8_headers) + video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', query={'token': self._TOKEN}) entries.append({ 'id': video['id'], 'formats': formats, 'subtitles': subtitles, - 'http_headers': m3u8_headers, 'album': metadata.get('title'), 'album_artist': metadata.get('artist'), 'track_number': idx, From c2d8ee0000302aba63476b7d5bd8793e57b6c8c6 Mon 
Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:09:14 +0200 Subject: [PATCH 340/501] [ie/weverse] Support extraction without auth (#7924) Authored by: seproDev --- yt_dlp/extractor/weverse.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index 9a08b8e43b..bbf62856a6 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -70,10 +70,8 @@ def _real_initialize(self): return token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value) - if not token: - self.raise_login_required() - - WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' + if token: + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js @@ -101,11 +99,14 @@ def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): self.raise_login_required( 'Session token has expired. Log in again or refresh cookies in browser') elif isinstance(e.cause, HTTPError) and e.cause.status == 403: - raise ExtractorError('Your account does not have access to this content', expected=True) + if 'Authorization' in self._API_HEADERS: + raise ExtractorError('Your account does not have access to this content', expected=True) + self.raise_login_required() raise def _call_post_api(self, video_id): - return self._call_api(f'/post/v1.0/post-{video_id}?fieldSet=postV1', video_id) + path = '' if 'Authorization' in self._API_HEADERS else '/preview' + return self._call_api(f'/post/v1.0/post-{video_id}{path}?fieldSet=postV1', video_id) def _get_community_id(self, channel): return str(self._call_api( From b9f2bc2dbed2323734a0d18e65e1e2e23dc833d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Touz=C3=A9?= <60022007+nathantouze@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:33:48 +0200 Subject: [PATCH 341/501] [ie/Dropbox] Fix extractor (#7926) Closes #7005, Closes #7696 Authored by: nathantouze, bashonly, denhotte --- yt_dlp/extractor/dropbox.py | 42 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index 214b309bfd..ec86d7ad24 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -1,3 +1,4 @@ +import base64 import os.path import re @@ -5,14 +6,13 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, - traverse_obj, - try_get, + update_url_query, url_basename, ) class DropboxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' + _VALID_URL = r'https?://(?:www\.)?dropbox\.com/(?:(?:e/)?scl/fi|sh?)/(?P<id>\w+)' _TESTS = [ { 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', @@ -22,7 +22,16 @@ class DropboxIE(InfoExtractor): 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } }, { - 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh', + 'only_matching': True, + }, { + 'url': 'https://www.dropbox.com/sh/2mgpiuq7kv8nqdf/AABy-fW4dkydT4GmWi2mdOUDa?dl=0&preview=Drone+Shot.mp4', + 'only_matching': True, + }, { + 'url': 
'https://www.dropbox.com/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h', + 'only_matching': True, + }, { + 'url': 'https://www.dropbox.com/e/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h', 'only_matching': True, }, ] @@ -53,16 +62,25 @@ def _real_extract(self, url): else: raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, - contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props'] - transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) - formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) + formats, subtitles, has_anonymous_download = [], {}, False + for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): + decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + transcode_url = self._search_regex( + r'\n\x03(https://[^\x12\x03\n]+\.m3u8)', decoded, 'transcode url', default=None) + if not transcode_url: + continue + formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) + has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) + break # downloads enabled we can get the original file - if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []): - video_url = re.sub(r'[?&]dl=0', '', url) - video_url += ('?' if '?' not in video_url else '&') + 'dl=1' - formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) + if has_anonymous_download: + formats.append({ + 'url': update_url_query(url, {'dl': '1'}), + 'format_id': 'original', + 'format_note': 'Original', + 'quality': 1 + }) return { 'id': video_id, From 665876034c8d3c031443f6b4958bed02ccdf4164 Mon Sep 17 00:00:00 2001 From: Stavros Ntentos <133706+stdedos@users.noreply.github.com> Date: Tue, 29 Aug 2023 03:05:49 +0300 Subject: [PATCH 342/501] [ie/antenna] Support antenna.gr (#7584) Authored by: stdedos --- yt_dlp/extractor/_extractors.py | 4 +- .../extractor/{ant1newsgr.py => antenna.py} | 53 ++++++++++++------- 2 files changed, 36 insertions(+), 21 deletions(-) rename yt_dlp/extractor/{ant1newsgr.py => antenna.py} (72%) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 194ad8356f..f11554bddf 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1699,8 +1699,8 @@ MegaTVComIE, MegaTVComEmbedIE, ) -from .ant1newsgr import ( - Ant1NewsGrWatchIE, +from .antenna import ( + AntennaGrWatchIE, Ant1NewsGrArticleIE, Ant1NewsGrEmbedIE, ) diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/antenna.py similarity index 72% rename from yt_dlp/extractor/ant1newsgr.py rename to yt_dlp/extractor/antenna.py index 217e3acc43..c78717aa9e 100644 --- a/yt_dlp/extractor/ant1newsgr.py +++ b/yt_dlp/extractor/antenna.py @@ -5,22 +5,26 @@ from ..utils import ( ExtractorError, determine_ext, + make_archive_id, scale_thumbnails_to_max_format_width, ) -class Ant1NewsGrBaseIE(InfoExtractor): +class AntennaBaseIE(InfoExtractor): def _download_and_extract_api_data(self, video_id, netloc, cid=None): - url = f'{self.http_scheme()}//{netloc}{self._API_PATH}' - info = self._download_json(url, video_id, query={'cid': cid or video_id}) - try: - source = 
info['url']
-        except KeyError:
-            raise ExtractorError('no source found for %s' % video_id)
-        formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
-                         if determine_ext(source) == 'm3u8' else ([{'url': source}], {}))
+        info = self._download_json(f'{self.http_scheme()}//{netloc}{self._API_PATH}',
+                                   video_id, query={'cid': cid or video_id})
+        if not info.get('url'):
+            raise ExtractorError(f'No source found for {video_id}')
+
+        ext = determine_ext(info['url'])
+        if ext == 'm3u8':
+            formats, subs = self._extract_m3u8_formats_and_subtitles(info['url'], video_id, 'mp4')
+        else:
+            formats, subs = [{'url': info['url'], 'format_id': ext}], {}
+
         thumbnails = scale_thumbnails_to_max_format_width(
-            formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+')
+            formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') if info.get('thumb') else []
         return {
             'id': video_id,
             'title': info.get('title'),
@@ -30,21 +34,31 @@ def _download_and_extract_api_data(self, video_id, netloc, cid=None):
         }
 
 
-class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
-    IE_NAME = 'ant1newsgr:watch'
-    IE_DESC = 'ant1news.gr videos'
-    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/'
+class AntennaGrWatchIE(AntennaBaseIE):
+    IE_NAME = 'antenna:watch'
+    IE_DESC = 'antenna.gr and ant1news.gr videos'
+    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:antenna|ant1news)\.gr)/watch/(?P<id>\d+)/'
     _API_PATH = '/templates/data/player'
 
     _TESTS = [{
         'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
-        'md5': '95925e6b32106754235f2417e0d2dfab',
+        'md5': 'c472d9dd7cd233c63aff2ea42201cda6',
        'info_dict': {
             'id': '1506168',
             'ext': 'mp4',
             'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
             'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
-            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg',
+            'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/26d46bf6-8158-4f02-b197-7096c714b2de\.jpg',
+        },
+    }, {
+        'url': 'https://www.antenna.gr/watch/1643812/oi-prodotes-epeisodio-01',
+        'md5': '8f6f7dd3b1dba4d835ba990e25f31243',
+        'info_dict': {
+            'id': '1643812',
+            'ext': 'mp4',
+            'format_id': 'mp4',
+            'title': 'ΟΙ ΠΡΟΔΟΤΕΣ – ΕΠΕΙΣΟΔΙΟ 01',
+            'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/b3d63096-e72d-43c4-87a0-00d4363d242f\.jpg',
         },
     }]
 
@@ -52,11 +66,12 @@ def _real_extract(self, url):
         video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
         webpage = self._download_webpage(url, video_id)
         info = self._download_and_extract_api_data(video_id, netloc)
-        info['description'] = self._og_search_description(webpage)
+        info['description'] = self._og_search_description(webpage, default=None)
+        info['_old_archive_ids'] = [make_archive_id('Ant1NewsGrWatch', video_id)]
         return info
 
 
-class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
+class Ant1NewsGrArticleIE(AntennaBaseIE):
     IE_NAME = 'ant1newsgr:article'
     IE_DESC = 'ant1news.gr articles'
     _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
@@ -96,7 +111,7 @@ def _real_extract(self, url):
             video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
 
 
-class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
+class Ant1NewsGrEmbedIE(AntennaBaseIE):
     IE_NAME = 'ant1newsgr:embed'
     IE_DESC = 'ant1news.gr embedded videos'
     _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player

From 4b3a6ef1b3e235ba9a45142830b6edb357c71696 Mon Sep 17 00:00:00 2001
From: Omar Atef 
<85079143+Yalab7@users.noreply.github.com> Date: Tue, 29 Aug 2023 03:49:29 +0300 Subject: [PATCH 343/501] [ie/hungama] Overhaul extractors (#7757) Closes #7754 Authored by: Yalab7, bashonly --- yt_dlp/extractor/hungama.py | 109 +++++++++++++++++++++++++++--------- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/hungama.py b/yt_dlp/extractor/hungama.py index 2e9939601f..cdec36838e 100644 --- a/yt_dlp/extractor/hungama.py +++ b/yt_dlp/extractor/hungama.py @@ -1,19 +1,32 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, + remove_end, + traverse_obj, try_get, + unified_timestamp, + url_or_none, urlencode_postdata, ) -class HungamaIE(InfoExtractor): +class HungamaBaseIE(InfoExtractor): + def _call_api(self, path, content_id, fatal=False): + return traverse_obj(self._download_json( + f'https://cpage.api.hungama.com/v2/page/content/{content_id}/{path}/detail', + content_id, fatal=fatal, query={ + 'device': 'web', + 'platform': 'a', + 'storeId': '1', + }), ('data', {dict})) or {} + + +class HungamaIE(HungamaBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?hungama\.com/ + (?:www\.|un\.)?hungama\.com/ (?: - (?:video|movie)/[^/]+/| + (?:video|movie|short-film)/[^/]+/| tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ ) (?P<id>\d+) @@ -25,13 +38,28 @@ class HungamaIE(InfoExtractor): 'id': '39349649', 'ext': 'mp4', 'title': 'Krishna Chants', - 'description': 'Watch Krishna Chants video now. You can also watch other latest videos only at Hungama', + 'description': ' ', 'upload_date': '20180829', 'duration': 264, 'timestamp': 1535500800, 'view_count': int, - 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', - } + 'thumbnail': 'https://images1.hungama.com/tr:n-a_169_m/c/1/0dc/2ca/39349649/39349649_350x197.jpg?v=8', + 'tags': 'count:6', + }, + }, { + 'url': 'https://un.hungama.com/short-film/adira/102524179/', + 'md5': '2278463f5dc9db9054d0c02602d44666', + 'info_dict': { + 'id': '102524179', + 'ext': 'mp4', + 'title': 'Adira', + 'description': 'md5:df20cd4d41eabb33634f06de1025a4b4', + 'upload_date': '20230417', + 'timestamp': 1681689600, + 'view_count': int, + 'thumbnail': 'https://images1.hungama.com/tr:n-a_23_m/c/1/197/ac9/102524179/102524179_350x525.jpg?v=1', + 'tags': 'count:7', + }, }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', 'only_matching': True, @@ -51,14 +79,19 @@ def _real_extract(self, url): 'c': 'common', 'm': 'get_video_mdn_url', }) - formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - - json_ld = self._search_json_ld( - self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + metadata = self._call_api('movie', video_id) return { - **json_ld, + **traverse_obj(metadata, ('head', 'data', { + 'title': ('title', {str}), + 'description': ('misc', 'description', {str}), + 'duration': ('duration', {int}), # duration in JSON is incorrect if string + 'timestamp': ('releasedate', {unified_timestamp}), + 'view_count': ('misc', 'playcount', {int_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'tags': ('misc', 'keywords', ..., {str}), + })), 'id': video_id, 'formats': formats, 'subtitles': { @@ -71,10 +104,10 @@ def _real_extract(self, url): class HungamaSongIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 
'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', - 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', 'info_dict': { 'id': '2931166', 'ext': 'mp3', @@ -83,8 +116,22 @@ class HungamaSongIE(InfoExtractor): 'artist': 'Lucky Ali', 'album': None, 'release_year': 2000, - } - } + 'thumbnail': 'https://stat2.hungama.ind.in/assets/images/default_images/da-200x200.png', + }, + }, { + 'url': 'https://un.hungama.com/song/tum-kya-mile-from-rocky-aur-rani-kii-prem-kahaani/103553672', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', + 'info_dict': { + 'id': '103553672', + 'ext': 'mp3', + 'title': 'md5:5ebeb1e10771b634ce5f700ce68ae5f4', + 'track': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'artist': 'Pritam Chakraborty, Arijit Singh, Shreya Ghoshal, Amitabh Bhattacharya', + 'album': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'release_year': 2023, + 'thumbnail': 'https://images.hungama.com/c/1/7c2/c7b/103553671/103553671_200x200.jpg', + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) @@ -122,8 +169,8 @@ def _real_extract(self, url): } -class HungamaAlbumPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)' +class HungamaAlbumPlaylistIE(HungamaBaseIE): + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/(?P<path>playlists|album)/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/', 'playlist_mincount': 7, @@ -132,16 +179,24 @@ class HungamaAlbumPlaylistIE(InfoExtractor): }, }, { 'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/', - 'playlist_mincount': 50, + 'playlist_mincount': 33, 'info_dict': { 'id': '123063', }, + }, { + 'url': 'https://un.hungama.com/album/what-jhumka-%3F-from-rocky-aur-rani-kii-prem-kahaani/103891805/', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '103891805', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)' - items = re.findall(ptrn, webpage) - entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items] - return self.playlist_result(entries, video_id) + playlist_id, path = self._match_valid_url(url).group('id', 'path') + data = self._call_api(remove_end(path, 's'), playlist_id, fatal=True) + + def entries(): + for song_url in traverse_obj(data, ('body', 'rows', ..., 'data', 'misc', 'share', {url_or_none})): + yield self.url_result(song_url, HungamaSongIE) + + return self.playlist_result(entries(), playlist_id) From 099fb1b35cf835303306549f5113d1802d79c9c7 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Tue, 29 Aug 2023 08:06:02 -0500 Subject: [PATCH 344/501] Bugfix for b9f2bc2dbed2323734a0d18e65e1e2e23dc833d8 Authored by: bashonly --- yt_dlp/extractor/dropbox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index ec86d7ad24..bc2efce123 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -66,10 +66,10 @@ def _real_extract(self, url): for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') transcode_url = self._search_regex( - r'\n\x03(https://[^\x12\x03\n]+\.m3u8)', decoded, 'transcode url', default=None) + 
r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) if not transcode_url: continue - formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) break From bae4834245a708fff97219849ec880c319c88bc6 Mon Sep 17 00:00:00 2001 From: RedDeffender <74822209+RedDeffender@users.noreply.github.com> Date: Thu, 31 Aug 2023 01:26:45 +0200 Subject: [PATCH 345/501] [ie/NoodleMagazine] Fix extraction (#7830) Closes #7917 Authored by: RedDeffender --- yt_dlp/extractor/noodlemagazine.py | 31 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/noodlemagazine.py b/yt_dlp/extractor/noodlemagazine.py index e6208956fb..1cea0dbda9 100644 --- a/yt_dlp/extractor/noodlemagazine.py +++ b/yt_dlp/extractor/noodlemagazine.py @@ -1,9 +1,14 @@ from .common import InfoExtractor from ..utils import ( - parse_duration, + extract_attributes, + get_element_html_by_id, + int_or_none, parse_count, - unified_strdate + parse_duration, + unified_strdate, + urljoin, ) +from ..utils.traversal import traverse_obj class NoodleMagazineIE(InfoExtractor): @@ -37,15 +42,21 @@ def _real_extract(self, url): like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) - key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key') - playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id) - thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + player_path = extract_attributes(get_element_html_by_id('iplayer', webpage) or '')['src'] + player_iframe = self._download_webpage( + urljoin('https://adult.noodlemagazine.com', player_path), video_id, 'Downloading iframe page') + playlist_url = self._search_regex( + r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') + playlist_info = self._download_json( + urljoin('https://adult.noodlemagazine.com', playlist_url), video_id, headers={'Referer': url}) - formats = [{ - 'url': source.get('file'), - 'quality': source.get('label'), - 'ext': source.get('type'), - } for source in playlist_info.get('sources')] + thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + formats = traverse_obj(playlist_info, ('sources', lambda _, v: v['file'], { + 'url': 'file', + 'format_id': 'label', + 'height': ('label', {int_or_none}), + 'ext': 'type', + })) return { 'id': video_id, From 630a55df8de7747e79aa680959d785dfff2c4b76 Mon Sep 17 00:00:00 2001 From: Grabien <60237587+Grabien@users.noreply.github.com> Date: Thu, 31 Aug 2023 02:49:42 +0300 Subject: [PATCH 346/501] [ie/Mediaite] Fix extraction (#7923) Authored by: Grabien --- yt_dlp/extractor/mediaite.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py index 0f9079b112..ab253920b6 100644 --- a/yt_dlp/extractor/mediaite.py +++ b/yt_dlp/extractor/mediaite.py @@ -81,10 +81,24 @@ class MediaiteIE(InfoExtractor): 'upload_date': '20210930', }, 'params': {'skip_download': True} + }, { + 'url': 
'https://www.mediaite.com/politics/i-cant-read-it-fast-enough-while-defending-trump-larry-kudlow-overwhelmed-by-volume-of-ex-presidents-legal-troubles/', + 'info_dict': { + 'id': 'E6EhDX5z', + 'ext': 'mp4', + 'title': 'Fox Business Network - 4:00 PM - 5:00 PM - 1:39:42 pm - 1:42:20 pm', + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/E6EhDX5z/poster.jpg?width=720', + 'duration': 157, + 'timestamp': 1691015535, + 'upload_date': '20230802', + }, + 'params': {'skip_download': True} }] def _real_extract(self, url): webpage = self._download_webpage(url, None) - id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id') - data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id) + video_id = self._search_regex( + [r'"https://cdn\.jwplayer\.com/players/(\w+)', r'data-video-id\s*=\s*\"([^\"]+)\"'], webpage, 'id') + data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{video_id}', video_id) return self._parse_jwplayer_data(data_json) From 30ea88591b728cca0896018dbf67c2298070c669 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Thu, 31 Aug 2023 15:45:11 -0500 Subject: [PATCH 347/501] [ie/hotstar] Make metadata extraction non-fatal Authored by: bashonly --- yt_dlp/extractor/hotstar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index cdd9379416..6cadfb5b7d 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -200,8 +200,10 @@ def _real_extract(self, url): video_type = self._TYPE.get(video_type, video_type) cookies = self._get_cookies(url) # Cookies before any request - video_data = self._call_api_v1(f'{video_type}/detail', video_id, - query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] + video_data = traverse_obj( + self._call_api_v1( + f'{video_type}/detail', video_id, fatal=False, query={'tas': 10000, 'contentId': video_id}), + ('body', 'results', 'item', {dict})) or {} if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) From 7237c8dca0590aa7438ade93f927df88c9381ec7 Mon Sep 17 00:00:00 2001 From: Rajeshwaran <54212165+Rajeshwaran2001@users.noreply.github.com> Date: Fri, 1 Sep 2023 02:18:52 +0530 Subject: [PATCH 348/501] [ie/hotstar] Extract `release_year` (#7869) Authored by: Rajeshwaran2001 --- yt_dlp/extractor/hotstar.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 6cadfb5b7d..541792b908 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -142,6 +142,26 @@ class HotStarIE(HotStarBaseIE): 'duration': 1272, 'channel_id': 3, }, + 'skip': 'HTTP Error 504: Gateway Time-out', # XXX: Investigate 504 errors on some episodes + }, { + 'url': 'https://www.hotstar.com/in/shows/kana-kaanum-kaalangal/1260097087/back-to-school/1260097320', + 'info_dict': { + 'id': '1260097320', + 'ext': 'mp4', + 'title': 'Back To School', + 'season': 'Chapter 1', + 'description': 'md5:b0d6a4c8a650681491e7405496fc7e13', + 'timestamp': 1650564000, + 'channel': 'Hotstar Specials', + 'series': 'Kana Kaanum Kaalangal', + 'season_number': 1, + 'season_id': 9441, + 'upload_date': '20220421', + 'episode': 'Back To School', + 'episode_number': 1, + 'duration': 1810, + 'channel_id': 54, + }, }, { 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286', 'info_dict': { @@ -154,6 +174,19 @@ class 
HotStarIE(HotStarBaseIE): 'timestamp': 1622943900, 'duration': 5395, }, + }, { + 'url': 'https://www.hotstar.com/in/movies/premam/1000091195', + 'info_dict': { + 'id': '1000091195', + 'ext': 'mp4', + 'title': 'Premam', + 'release_year': 2015, + 'description': 'md5:d833c654e4187b5e34757eafb5b72d7f', + 'timestamp': 1462149000, + 'upload_date': '20160502', + 'episode': 'Premam', + 'duration': 8994, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -288,6 +321,7 @@ def _real_extract(self, url): 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(traverse_obj(video_data, 'broadcastDate', 'startDate')), + 'release_year': int_or_none(video_data.get('year')), 'formats': formats, 'subtitles': subs, 'channel': video_data.get('channelName'), From 77bff23ee97565bab2e0d75b893a21bf7983219a Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sat, 2 Sep 2023 15:18:04 +0200 Subject: [PATCH 349/501] Bugfix for 59e92b1f1833440bb2190f847eb735cf0f90bc85 Closes #8012 Authored by: Grub4K --- yt_dlp/networking/_urllib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 5a804d99b4..b3e705b844 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -156,6 +156,8 @@ def brotli(data): def gz(data): # There may be junk added the end of the file # We ignore it by only ever decoding a single gzip payload + if not data: + return data return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16) def http_request(self, req): From 2301b5c1b77a65abbb46b72f91e1e4666fd5d985 Mon Sep 17 00:00:00 2001 From: Mattias Wadman <mattias.wadman@gmail.com> Date: Sat, 2 Sep 2023 16:40:11 +0200 Subject: [PATCH 350/501] [ie/SVTPlay] Fix extraction (#7789) Closes #5595 Authored by: wader, dirkf --- yt_dlp/extractor/svt.py | 61 +++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 31bf7f97e6..18da87534f 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -1,3 +1,4 @@ +import json import re from .common import InfoExtractor @@ -6,10 +7,11 @@ determine_ext, dict_get, int_or_none, - unified_timestamp, str_or_none, strip_or_none, + traverse_obj, try_get, + unified_timestamp, ) @@ -163,10 +165,46 @@ class SVTPlayIE(SVTPlayBaseIE): }, }, 'params': { - # skip for now due to download test asserts that segment is > 10000 bytes and svt uses - # init segments that are smaller - # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B - 'skip_download': True, + 'skip_download': 'm3u8', + }, + 'skip': 'Episode is no longer available', + }, { + 'url': 'https://www.svtplay.se/video/emBxBQj', + 'md5': '2382036fd6f8c994856c323fe51c426e', + 'info_dict': { + 'id': 'eyBd9aj', + 'ext': 'mp4', + 'title': '1. Farlig kryssning', + 'timestamp': 1491019200, + 'upload_date': '20170401', + 'duration': 2566, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', + 'age_limit': 0, + 'episode': '1. 
Farlig kryssning', + 'series': 'Rederiet', + 'subtitles': { + 'sv': 'count:3' + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.svtplay.se/video/jz2rYz7/anders-hansen-moter/james-fallon?info=visa', + 'info_dict': { + 'id': 'jvXAGVb', + 'ext': 'mp4', + 'title': 'James Fallon', + 'timestamp': 1673917200, + 'upload_date': '20230117', + 'duration': 1081, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', + 'age_limit': 0, + 'episode': 'James Fallon', + 'series': 'Anders Hansen möter...', + }, + 'params': { + 'skip_download': 'dash', }, }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', @@ -247,15 +285,16 @@ def _real_extract(self, url): data, lambda x: x['statistics']['dataLake']['content']['id'], compat_str) + if not svt_id: + nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False) + svt_id = traverse_obj(nextjs_data, ( + 'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath', + 'video', 'svtId', {str}), get_all=False) + if not svt_id: svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), - r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', - r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/[\w-]+/[^"\']*\b(?:modalId|id)=([\w-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From 69dbfe01c47cd078682a87f179f5846e2679e927 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Mon, 4 Sep 2023 11:18:59 -0500 Subject: [PATCH 351/501] Bugfix for bae4834245a708fff97219849ec880c319c88bc6 Authored by: bashonly --- yt_dlp/extractor/noodlemagazine.py | 35 ++++++++++++++++++------------ 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/noodlemagazine.py b/yt_dlp/extractor/noodlemagazine.py index 1cea0dbda9..1c1a763dc2 100644 --- a/yt_dlp/extractor/noodlemagazine.py +++ b/yt_dlp/extractor/noodlemagazine.py @@ -1,7 +1,5 @@ from .common import InfoExtractor from ..utils import ( - extract_attributes, - get_element_html_by_id, int_or_none, parse_count, parse_duration, @@ -42,27 +40,36 @@ def _real_extract(self, url): like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) - player_path = extract_attributes(get_element_html_by_id('iplayer', webpage) or '')['src'] + def build_url(url_or_path): + return urljoin('https://adult.noodlemagazine.com', url_or_path) + + headers = {'Referer': url} + player_path = self._html_search_regex( + r'<iframe[^>]+\bid="iplayer"[^>]+\bsrc="([^"]+)"', webpage, 'player path') player_iframe = self._download_webpage( - urljoin('https://adult.noodlemagazine.com', player_path), video_id, 'Downloading iframe page') + build_url(player_path), video_id, 'Downloading iframe page', headers=headers) playlist_url = self._search_regex( r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') - playlist_info = self._download_json( - urljoin('https://adult.noodlemagazine.com', playlist_url), video_id, 
headers={'Referer': url}) + playlist_info = self._download_json(build_url(playlist_url), video_id, headers=headers) - thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') - formats = traverse_obj(playlist_info, ('sources', lambda _, v: v['file'], { - 'url': 'file', - 'format_id': 'label', - 'height': ('label', {int_or_none}), - 'ext': 'type', - })) + formats = [] + for source in traverse_obj(playlist_info, ('sources', lambda _, v: v['file'])): + if source.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats( + build_url(source['file']), video_id, 'mp4', fatal=False, m3u8_id='hls')) + else: + formats.append(traverse_obj(source, { + 'url': ('file', {build_url}), + 'format_id': 'label', + 'height': ('label', {int_or_none}), + 'ext': 'type', + })) return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_property('image', webpage, default=None) or playlist_info.get('image'), 'duration': duration, 'description': description, 'tags': tags, From c6ef553792ed48462f9fd0e78143bef6b1a71c2e Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Tue, 5 Sep 2023 01:54:14 -0500 Subject: [PATCH 352/501] [ie/twitter:spaces] Pass referer header to downloader Closes #8029 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 34b8625c31..f86216f8ff 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1618,6 +1618,7 @@ def _real_extract(self, url): is_live = live_status == 'is_live' formats = [] + headers = {'Referer': 'https://twitter.com/'} if live_status == 'is_upcoming': self.raise_no_formats('Twitter Space not started yet', expected=True) elif not is_live and not metadata.get('is_space_available_for_replay'): @@ -1628,7 +1629,7 @@ def _real_extract(self, url): ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, - headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else [] + headers=headers, fatal=False) if source else [] for fmt in formats: fmt.update({'vcodec': 'none', 'acodec': 'aac'}) if not is_live: @@ -1653,6 +1654,7 @@ def _real_extract(self, url): lambda: int_or_none(metadata['scheduled_start'], scale=1000)), 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, + 'http_headers': headers, } From 99c99c7185f5d8e9b3699a6fc7f86ec663d7b97e Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Tue, 5 Sep 2023 14:58:02 -0500 Subject: [PATCH 353/501] [ie/gofile] Update token Closes #7235 Authored by: bashonly --- yt_dlp/extractor/gofile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py index ddbce2ee8f..8983905839 100644 --- a/yt_dlp/extractor/gofile.py +++ b/yt_dlp/extractor/gofile.py @@ -66,7 +66,7 @@ def _entries(self, file_id): query_params = { 'contentId': file_id, 'token': self._TOKEN, - 'websiteToken': 12345, + 'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js } password = self.get_param('videopassword') if password: From d3d81cc98f554d0adb87d24bfd6fabaaa803944d Mon Sep 17 00:00:00 2001 From: ringus1 <ringus1@users.noreply.github.com> Date: Tue, 5 Sep 2023 22:35:23 
+0200 Subject: [PATCH 354/501] [ie/facebook] Fix webpage extraction (#7890) Closes #7901 Authored by: ringus1 --- yt_dlp/extractor/facebook.py | 41 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 4fd17b5743..c30a6b06a0 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ + 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', + 'info_dict': { + 'id': '3676516585958356', + 'ext': 'mp4', + 'title': 'dr Adam Przygoda', + 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', + 'uploader': 'RADIO KICKS FM', + 'upload_date': '20230818', + 'timestamp': 1692346159, + 'thumbnail': r're:^https?://.*', + 'uploader_id': '100063551323670', + 'duration': 3132.184, + 'view_count': int, + 'concurrent_view_count': 0, + }, + }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', 'info_dict': { @@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', 'duration': 131.03, 'concurrent_view_count': int, }, @@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', 'concurrent_view_count': int, 'thumbnail': r're:^https?://.*', 'view_count': int, @@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', 'timestamp': 1549275572, 'duration': 3.413, 'uploader': 'Josef Novak', @@ -401,9 +417,9 @@ def _extract_from_url(self, url, video_id): def extract_metadata(webpage): post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)] post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) @@ -493,14 +509,14 @@ def process_formats(info): def extract_relay_data(_filter): return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + r'data-sjs>({.*?%s.*?})</script>' % _filter, webpage, 'replay data', default='{}'), video_id, fatal=False) or {} def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in 
(replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + return traverse_obj(extract_relay_data(_filter), ( + 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ..., + '__bbox', 'result', 'data', {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -511,7 +527,7 @@ def extract_relay_prefetched_data(_filter): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') if data: entries = [] @@ -526,7 +542,8 @@ def parse_graphql_video(video): formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', '')): + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): playable_url = video.get(key) if not playable_url: continue From fe371dcf0ba5ce8d42480eade54eeeac99ab3cb0 Mon Sep 17 00:00:00 2001 From: ifan-t <jacifan2000@gmail.com> Date: Fri, 8 Sep 2023 13:25:43 +0100 Subject: [PATCH 355/501] [ie/S4C] Add series support and extract subs/thumbs (#7776) Authored by: ifan-t --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/s4c.py | 57 +++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f11554bddf..b788737a2d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1710,7 +1710,10 @@ RuvIE, RuvSpilaIE ) -from .s4c import S4CIE +from .s4c import ( + S4CIE, + S4CSeriesIE +) from .safari import ( SafariIE, SafariApiIE, diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py index 38a9058960..990ea2b447 100644 --- a/yt_dlp/extractor/s4c.py +++ b/yt_dlp/extractor/s4c.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import traverse_obj +from ..utils import traverse_obj, url_or_none class S4CIE(InfoExtractor): @@ -11,7 +11,8 @@ class S4CIE(InfoExtractor): 'ext': 'mp4', 'title': 'Y Swn', 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', - 'duration': 5340 + 'duration': 5340, + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg' }, }, { 'url': 'https://www.s4c.cymru/clic/programme/856636948', @@ -21,6 +22,7 @@ class S4CIE(InfoExtractor): 'title': 'Am Dro', 'duration': 2880, 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg' }, }] @@ -30,7 +32,7 @@ def _real_extract(self, url): f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', video_id, fatal=False) - filename = self._download_json( + player_config = self._download_json( 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ 'programme_id': video_id, 'signed': '0', @@ -38,7 +40,13 @@ def _real_extract(self, url): 'mode': 'od', 'appId': 'clic', 'streamName': '', - }, note='Downloading player config JSON')['filename'] + }, note='Downloading player config JSON') + subtitles = {} + for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))): + subtitles.setdefault(sub.get('3', 'en'), []).append({ + 'url': sub['0'], + 'name': sub.get('1'), + }) m3u8_url = self._download_json( 
'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ 'mode': 'od', @@ -46,17 +54,52 @@ def _real_extract(self, url): 'region': 'WW', 'extra': 'false', 'thirdParty': 'false', - 'filename': filename, + 'filename': player_config['filename'], }, note='Downloading streaming urls JSON')['hls'] - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'formats': formats, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'), 'subtitles': subtitles, + 'thumbnail': url_or_none(player_config.get('poster')), **traverse_obj(details, ('full_prog_details', 0, { 'title': (('programme_title', 'series_title'), {str}), 'description': ('full_billing', {str.strip}), 'duration': ('duration', {lambda x: int(x) * 60}), }), get_all=False), } + + +class S4CSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/series/864982911', + 'playlist_mincount': 6, + 'info_dict': { + 'id': '864982911', + 'title': 'Iaith ar Daith', + 'description': 'md5:e878ebf660dce89bd2ef521d7ce06397' + }, + }, { + 'url': 'https://www.s4c.cymru/clic/series/866852587', + 'playlist_mincount': 8, + 'info_dict': { + 'id': '866852587', + 'title': 'FFIT Cymru', + 'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96' + }, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + series_details = self._download_json( + 'https://www.s4c.cymru/df/series_details', series_id, query={ + 'lang': 'e', + 'series_id': series_id, + 'show_prog_in_series': 'Y' + }, note='Downloading series details JSON') + + return self.playlist_result( + [self.url_result(f'https://www.s4c.cymru/clic/programme/{episode_id}', S4CIE, episode_id) + for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id'))], + series_id, traverse_obj(series_details, ('full_prog_details', 0, 'series_title', {str}))) From 5d0395498d7065aa5e55bac85fa9354b4b0d48eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szaby=20Gr=C3=BCnwald?= <szaby.gruenwald@web.de> Date: Fri, 8 Sep 2023 14:54:41 +0200 Subject: [PATCH 356/501] [ie/wdr] Fix extraction (#7979) Closes #7461 Authored by: szabyg --- yt_dlp/extractor/wdr.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py index de5dc26667..6767f26544 100644 --- a/yt_dlp/extractor/wdr.py +++ b/yt_dlp/extractor/wdr.py @@ -173,6 +173,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'skip': 'HTTP Error 404: Not Found', }, { + # FIXME: Asset JSON is directly embedded in webpage 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { 'id': 'mdb-2296252', @@ -221,6 +222,8 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'id': 'mdb-869971', 'ext': 'mp4', 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'COSMO Livestream', + 'live_status': 'is_live', 'upload_date': '20160101', }, 'params': { @@ -248,6 +251,16 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', 'only_matching': True, }, + { + 'url': 'https://www1.wdr.de/mediathek/video/sendungen/rockpalast/video-baroness---freak-valley-festival--100.html', + 'info_dict': { + 'id': 'mdb-2741028', + 'ext': 'mp4', + 'title': 'Baroness - Freak Valley Festival 2022', + 
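+                # the show name is reported separately from the episode title,
+                # hence the dedicated 'alt_title' assertion below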
'alt_title': 'Rockpalast', + 'upload_date': '20220725', + }, + } ] def _real_extract(self, url): @@ -259,7 +272,7 @@ def _real_extract(self, url): # Article with several videos - # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de the data-extension-ard is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link # to the page in a multiline "videoLink"-tag) @@ -268,7 +281,7 @@ def _real_extract(self, url): (?: (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 + )data-extension(?:-ard)?=(["\'])(?P<data>(?:(?!\3).)+)\3 ''', webpage): media_link_obj = self._parse_json( mobj.group('data'), display_id, transform_source=js_to_json, @@ -295,7 +308,7 @@ def _real_extract(self, url): compat_urlparse.urljoin(url, mobj.group('href')), ie=WDRPageIE.ie_key()) for mobj in re.finditer( - r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=', + r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension(?:-ard)?=', webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) ] From a006ce2b27357c15792eb5c18f06765e640b801c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 9 Sep 2023 10:14:49 -0500 Subject: [PATCH 357/501] [ie/twitter] Fix retweet extraction and syndication API (#8016) Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/twitter.py | 181 ++++++++++++++++++++++++++---------- 2 files changed, 132 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index b82d92a6ec..c7b73f4fd6 100644 --- a/README.md +++ b/README.md @@ -1854,7 +1854,7 @@ #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` #### twitter -* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed +* `api`: Select one of `graphql` (default), `legacy` or `syndication` as the API for tweet extraction. Has no effect if logged in #### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. 
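
As a usage sketch for the option documented above (the tweet URL is a
placeholder, not a real test case), the selector can be set from the command
line or through the embedding API; the dict shape mirrors the test params
added further down in this patch:

    $ yt-dlp --extractor-args "twitter:api=syndication" "https://twitter.com/user/status/123"

    import yt_dlp

    opts = {'extractor_args': {'twitter': {'api': ['syndication']}}}
    with yt_dlp.YoutubeDL(opts) as ydl:
        info = ydl.extract_info('https://twitter.com/user/status/123', download=False)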
Can be found in browser local storage

diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index f86216f8ff..4065acbaaa 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -1,9 +1,10 @@
-import functools
 import json
+import random
 import re
 
 from .common import InfoExtractor
 from .periscope import PeriscopeBaseIE, PeriscopeIE
+from ..compat import functools  # isort: split
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_unquote,
@@ -147,10 +148,14 @@ def _search_dimensions_in_video_url(a_format, video_url):
     def is_logged_in(self):
         return bool(self._get_cookies(self._API_BASE).get('auth_token'))
 
+    @functools.cached_property
+    def _selected_api(self):
+        return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0]
+
     def _fetch_guest_token(self, display_id):
         guest_token = traverse_obj(self._download_json(
             f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
-            headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
+            headers=self._set_base_headers(legacy=display_id and self._selected_api == 'legacy')),
             ('guest_token', {str}))
         if not guest_token:
             raise ExtractorError('Could not retrieve guest token')
@@ -295,7 +300,7 @@ def input_dict(subtask_id, text):
         self.report_login()
 
     def _call_api(self, path, video_id, query={}, graphql=False):
-        headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
+        headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy')
         headers.update({
             'x-twitter-auth-type': 'OAuth2Session',
             'x-twitter-client-language': 'en',
@@ -707,6 +712,7 @@ class TwitterIE(TwitterBaseIE):
             'tags': [],
             'age_limit': 0,
         },
+        'skip': 'This Tweet is unavailable',
    }, {
        # not available in Periscope
        'url': 'https://twitter.com/ViviEducation/status/1136534865145286656',
@@ -721,6 +727,7 @@ class TwitterIE(TwitterBaseIE):
            'view_count': int,
        },
        'add_ie': ['TwitterBroadcast'],
+        'skip': 'Broadcast no longer exists',
    }, {
        # unified card
        'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
@@ -773,9 +780,9 @@ class TwitterIE(TwitterBaseIE):
        'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
        'info_dict': {
            'id': '1577719286659006464',
-            'title': 'Ultima📛 | #вʟм - Test',
+            'title': 'Ultima📛| New Era - Test',
            'description': 'Test https://t.co/Y3KEZD7Dad',
-            'uploader': 'Ultima📛 | #вʟм',
+            'uploader': 'Ultima📛| New Era',
            'uploader_id': 'UltimaShadowX',
            'uploader_url': 'https://twitter.com/UltimaShadowX',
            'upload_date': '20221005',
@@ -811,7 +818,7 @@ class TwitterIE(TwitterBaseIE):
            'age_limit': 0,
        },
    }, {
-        # Adult content, fails if not logged in (GraphQL)
+        # Adult content, fails if not logged in
        'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
        'info_dict': {
            'id': '1575199163847000068',
@@ -831,9 +838,10 @@ class TwitterIE(TwitterBaseIE):
            'age_limit': 18,
            'tags': []
        },
+        'params': {'skip_download': 'The media could not be played'},
        'skip': 'Requires authentication',
    }, {
-        # Playlist result only with auth
+        # Playlist result only with graphql API
        'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
        'playlist_mincount': 2,
        'info_dict': {
@@ -898,7 +906,7 @@ class TwitterIE(TwitterBaseIE):
            'uploader_id': 'MoniqueCamarra',
            'live_status': 'was_live',
            'release_timestamp': 1658417414,
-            'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
+            'description': 'md5:acce559345fd49f129c20dbcda3f1201',
            'timestamp': 1658407771,
            'release_date': '20220721',
            'upload_date': '20220721',
@@ -1007,10 +1015,10 @@ class TwitterIE(TwitterBaseIE):
            'view_count': int,
            'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
            'age_limit': 0,
-            'uploader': 'Mün The Friend Of YWAP',
+            'uploader': 'Mün',
            'repost_count': int,
            'upload_date': '20221206',
-            'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
+            'title': 'Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
            'comment_count': int,
            'like_count': int,
            'tags': [],
@@ -1019,7 +1027,7 @@ class TwitterIE(TwitterBaseIE):
            'timestamp': 1670306984.0,
        },
    }, {
-        # url to retweet id w/ legacy api
+        # retweeted_status (private)
        'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
        'info_dict': {
            'id': '1623274794488659969',
@@ -1039,32 +1047,84 @@ class TwitterIE(TwitterBaseIE):
            'like_count': int,
            'repost_count': int,
        },
-        'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
        'skip': 'Protected tweet',
    }, {
-        # orig tweet w/ graphql
-        'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
+        # retweeted_status
+        'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
        'info_dict': {
-            'id': '1623274794488659969',
-            'display_id': '1623739803874349067',
+            'id': '1694928337846538240',
            'ext': 'mp4',
-            'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy',
-            'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a',
-            'uploader': '@selfisekai@hackerspace.pl 🐀',
-            'uploader_id': 'liberdalau',
-            'uploader_url': 'https://twitter.com/liberdalau',
+            'display_id': '1695424220702888009',
+            'title': 'md5:e8daa9527bc2b947121395494f786d9d',
+            'description': 'md5:004f2d37fd58737724ec75bc7e679938',
+            'uploader': 'Benny Johnson',
+            'uploader_id': 'bennyjohnson',
+            'uploader_url': 'https://twitter.com/bennyjohnson',
            'age_limit': 0,
            'tags': [],
-            'duration': 8.033,
-            'timestamp': 1675964711.0,
-            'upload_date': '20230209',
-            'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
+            'duration': 45.001,
+            'timestamp': 1692962814.0,
+            'upload_date': '20230825',
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
            'like_count': int,
-            'view_count': int,
            'repost_count': int,
+            'view_count': int,
            'comment_count': int,
        },
-        'skip': 'Protected tweet',
+    }, {
+        # retweeted_status w/ legacy API
+        'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
+        'info_dict': {
+            'id': '1694928337846538240',
+            'ext': 'mp4',
+            'display_id': '1695424220702888009',
+            'title': 'md5:e8daa9527bc2b947121395494f786d9d',
+            'description': 'md5:004f2d37fd58737724ec75bc7e679938',
+            'uploader': 'Benny Johnson',
+            'uploader_id': 'bennyjohnson',
+            'uploader_url': 'https://twitter.com/bennyjohnson',
+            'age_limit': 0,
+            'tags': [],
+            'duration': 45.001,
+            'timestamp': 1692962814.0,
+            'upload_date': '20230825',
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
+            'like_count': int,
+            'repost_count': int,
+        },
+        'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
+    }, {
+        # Broadcast embedded in tweet
+        'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402',
+        'info_dict': {
+            'id': '1yNGaNLjEblJj',
+            'ext': 'mp4',
+            'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update',
+            'uploader': 'Jessica Dobson',
+            'uploader_id': '1DZEoDwDovRQa',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'view_count': int,
+        },
+        'add_ie': ['TwitterBroadcast'],
+    }, {
+        # Animated gif and quote tweet video, with syndication API
+        'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': '1696256659889565950',
+            'title': 'BAKOON - https://t.co/zom968d0a0',
+            'description': 'https://t.co/zom968d0a0',
+            'tags': [],
+            'uploader': 'BAKOON',
+            'uploader_id': 'BAKKOOONN',
+            'uploader_url': 'https://twitter.com/BAKKOOONN',
+            'age_limit': 18,
+            'timestamp': 1693254077.0,
+            'upload_date': '20230828',
+            'like_count': int,
+        },
+        'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
+        'expected_warnings': ['Not all metadata'],
    }, {
        # onion route
        'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -1103,6 +1163,14 @@ class TwitterIE(TwitterBaseIE):
        'only_matching': True,
    }]
 
+    _MEDIA_ID_RE = re.compile(r'_video/(\d+)/')
+
+    @property
+    def _GRAPHQL_ENDPOINT(self):
+        if self.is_logged_in:
+            return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail'
+        return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId'
+
     def _graphql_to_legacy(self, data, twid):
         result = traverse_obj(data, (
             'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
@@ -1130,9 +1198,14 @@ def _graphql_to_legacy(self, data, twid):
             'user': ('core', 'user_results', 'result', 'legacy'),
             'card': ('card', 'legacy'),
             'quoted_status': ('quoted_status_result', 'result', 'legacy'),
+            'retweeted_status': ('legacy', 'retweeted_status_result', 'result', 'legacy'),
         }, expected_type=dict, default={}))
 
-        # extra transformation is needed since result does not match legacy format
+        # extra transformations needed since result does not match legacy format
+        if status.get('retweeted_status'):
+            status['retweeted_status']['user'] = traverse_obj(status, (
+                'retweeted_status_result', 'result', 'core', 'user_results', 'result', 'legacy', {dict})) or {}
+
         binding_values = {
             binding_value.get('key'): binding_value.get('value')
             for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
@@ -1208,33 +1281,42 @@ def _build_graphql_query(self, media_id):
         }
 
     def _extract_status(self, twid):
-        if self.is_logged_in:
-            return self._graphql_to_legacy(
-                self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
+        if self.is_logged_in or self._selected_api == 'graphql':
+            status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
 
-        try:
-            if not self._configuration_arg('legacy_api'):
-                return self._graphql_to_legacy(
-                    self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
-            return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
+        elif self._selected_api == 'legacy':
+            status = self._call_api(f'statuses/show/{twid}.json', twid, {
                 'cards_platform': 'Web-12',
                 'include_cards': 1,
                 'include_reply_count': 1,
                 'include_user_entities': 0,
                 'tweet_mode': 'extended',
-            }), 'retweeted_status', None)
+            })
 
-        except ExtractorError as e:
-            if e.expected:
-                raise
+        elif self._selected_api == 'syndication':
             self.report_warning(
-                f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
+                'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
+            status = self._download_json(
+                'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+                headers={'User-Agent': 'Googlebot'}, query={
+                    'id': twid,
+                    # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
+                    'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
+                })
+            if not status:
+                raise ExtractorError('Syndication endpoint returned empty JSON response')
+            # Transform the result so its structure matches that of legacy/graphql
+            media = []
+            for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
+                detail['id_str'] = traverse_obj(detail, (
+                    'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
+                media.append(detail)
+            status['extended_entities'] = {'media': media}
 
-        status = self._download_json(
-            'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
-            headers={'User-Agent': 'Googlebot'}, query={'id': twid})
-        status['extended_entities'] = {'media': status.get('mediaDetails')}
-        return status
+        else:
+            raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
+
+        return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
 
     def _real_extract(self, url):
         twid, selected_index = self._match_valid_url(url).group('id', 'index')
@@ -1266,10 +1348,7 @@ def _real_extract(self, url):
         }
 
         def extract_from_video_info(media):
-            media_id = traverse_obj(media, 'id_str', 'id', (
-                'video_info', 'variants', ..., 'url',
-                {functools.partial(re.search, r'_video/(\d+)/')}, 1
-            ), get_all=False, expected_type=str_or_none) or twid
+            media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
             self.write_debug(f'Extracting from video info: {media_id}')
 
             formats = []
@@ -1503,6 +1582,8 @@ def _real_extract(self, url):
         broadcast = self._call_api(
             'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id]
+        if not broadcast:
+            raise ExtractorError('Broadcast no longer exists', expected=True)
         info = self._parse_broadcast_data(broadcast, broadcast_id)
         media_key = broadcast['media_key']
         source = self._call_api(
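The TODO in the syndication hunk above preserves the token formula used by Twitter's own web player, while the patch itself just sends ten random base-36 characters, which evidently satisfies the endpoint. For reference, a rough Python port of that JS expression might look like this; the helper name and the number of fractional digits are illustrative, not part of the patch:

    import math

    def syndication_token(twid, frac_digits=10):
        # ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
        value = int(twid) / 1e15 * math.pi
        digits = '0123456789abcdefghijklmnopqrstuvwxyz'
        integer, fraction = int(value), value % 1
        result = ''
        while integer:
            integer, remainder = divmod(integer, 36)
            result = digits[remainder] + result
        for _ in range(frac_digits):  # approximates JS float-to-base36 precision
            fraction *= 36
            result += digits[int(fraction)]
            fraction %= 1
        return result.replace('0', '')  # the JS regex strips zeroes and the dot

    print(syndication_token('1695424220702888009'))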
From 66cc64ff6696f9921ff112a278542f8d999ffea4 Mon Sep 17 00:00:00 2001
From: bashonly <bashonly@bashonly.com>
Date: Mon, 11 Sep 2023 09:51:39 -0500
Subject: [PATCH 358/501] [ie/zoom] Extract duration

Closes #8080
Authored by: bashonly
---
 yt_dlp/extractor/zoom.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py
index 3d7ccca760..1e41d04349 100644
--- a/yt_dlp/extractor/zoom.py
+++ b/yt_dlp/extractor/zoom.py
@@ -127,6 +127,7 @@ def _real_extract(self, url):
         return {
             'id': video_id,
             'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
+            'duration': int_or_none(data.get('duration')),
             'subtitles': subtitles,
             'formats': formats,
             'http_headers': {

From 7b71643cc986de9a3768dac4ac9b64f4d05e7f5e Mon Sep 17 00:00:00 2001
From: garret <garret1317@yandex.com>
Date: Fri, 15 Sep 2023 18:18:51 +0100
Subject: [PATCH 359/501] [ie/mixcloud] Update API URL (#8114)

Closes #8104
Authored by: garret1317
---
 yt_dlp/extractor/mixcloud.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py
index fb5a08ca28..8a95d1a5db 100644
--- a/yt_dlp/extractor/mixcloud.py
+++ b/yt_dlp/extractor/mixcloud.py
@@ -20,7 +20,7 @@ class MixcloudBaseIE(InfoExtractor):
     def _call_api(self, object_type, object_fields, display_id, username, slug=None):
         lookup_key = object_type + 'Lookup'
         return self._download_json(
-            'https://www.mixcloud.com/graphql', display_id, query={
+            'https://app.mixcloud.com/graphql', display_id, query={
                 'query': '''{
   %s(lookup: {username: "%s"%s}) {
     %s
@@ -46,7 +46,15 @@ class MixcloudIE(MixcloudBaseIE):
            'view_count': int,
            'timestamp': 1321359578,
            'upload_date': '20111115',
+            'uploader_url': 'https://www.mixcloud.com/dholbach/',
+            'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
+            'duration': 3723,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
        },
+        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
@@ -60,7 +68,14 @@ class MixcloudIE(MixcloudBaseIE):
            'view_count': int,
            'timestamp': 1422987057,
            'upload_date': '20150203',
+            'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
+            'duration': 2992,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
        },
+        'params': {'skip_download': '404 playback error on site'},
    }, {
        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
        'only_matching': True,
@@ -259,9 +274,9 @@ def _real_extract(self, url):
             cloudcast_url = cloudcast.get('url')
             if not cloudcast_url:
                 continue
-            slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
+            item_slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
             owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
-            video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None
+            video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None
             entries.append(self.url_result(
                 cloudcast_url, MixcloudIE.ie_key(), video_id))
 
@@ -284,7 +299,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
-            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        'playlist_mincount': 36,
    }, {
@@ -292,7 +307,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
        'info_dict': {
            'id': 'dholbach_uploads',
            'title': 'Daniel Holbach (uploads)',
-            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        'playlist_mincount': 36,
    }, {
@@ -300,7 +315,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
        'info_dict': {
            'id': 'dholbach_favorites',
            'title': 'Daniel Holbach (favorites)',
-            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+            'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
        },
        # 'params': {
        #     'playlist_items': '1-100',
@@ -323,9 +338,9 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
        'info_dict': {
            'id': 'FirstEar_stream',
            'title': 'First Ear (stream)',
-            'description': 'Curators of good music\r\n\r\nfirstearmusic.com',
+            'description': 'we maraud for ears',
        },
-        'playlist_mincount': 271,
+        'playlist_mincount': 269,
    }]
 
     _TITLE_KEY = 'displayName'

From 497bbbbd7328cb705f70eced94dbd90993819a46 Mon Sep 17 00:00:00 2001
From: SevenLives <410355694@qq.com>
Date: Sat, 16 Sep 2023 17:37:04 +0800
Subject: [PATCH 360/501] [ie/abematv] Fix proxy handling (#8046)

Fixes https://github.com/yt-dlp/yt-dlp/issues/8036

Authored by: SevenLives
---
 yt_dlp/extractor/abematv.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py
index 163b83c6da..2a093580cc 100644
--- a/yt_dlp/extractor/abematv.py
+++ b/yt_dlp/extractor/abematv.py
@@ -12,7 +12,7 @@
 import urllib.request
 import urllib.response
 import uuid
-
+from ..utils.networking import clean_proxies
 from .common import InfoExtractor
 from ..aes import aes_ecb_decrypt
 from ..utils import (
@@ -35,7 +35,10 @@ def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
     rh = ydl._request_director.handlers['Urllib']
     if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
         return
-    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
+    headers = ydl.params['http_headers'].copy()
+    proxies = ydl.proxies.copy()
+    clean_proxies(proxies, headers)
+    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
     assert isinstance(opener, urllib.request.OpenerDirector)
     opener.add_handler(handler)
     rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')

From 578a82e497502b951036ce9da6fe0dac6937ac27 Mon Sep 17 00:00:00 2001
From: Kshitiz Gupta <Kshitiz305@live.com>
Date: Sat, 16 Sep 2023 15:13:05 +0530
Subject: [PATCH 361/501] [ie/banbye] Support video ids containing a hyphen
 (#8059)

Fixes https://github.com/yt-dlp/yt-dlp/issues/7895

Authored by: kshitiz305
---
 yt_dlp/extractor/banbye.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py
index c873425656..e0fc93b973 100644
--- a/yt_dlp/extractor/banbye.py
+++ b/yt_dlp/extractor/banbye.py
@@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id):
 
 
 class BanByeIE(BanByeBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
         'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
@@ -59,7 +59,27 @@ class BanByeIE(BanByeBaseIE):
             'title': 'Krzysztof Karoń',
             'id': 'p_Ld82N6gBw_OJ',
         },
-        'playlist_count': 9,
+        'playlist_mincount': 9,
+    }, {
+        'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD',
+        'info_dict': {
+            'id': 'v_kb6_o1Kyq-CD',
+            'ext': 'mp4',
+            'title': 'Co tak naprawdę dzieje się we Francji?! Czy Warszawa a potem cała Polska będzie drugim Paryżem?!🤔🇵🇱',
+            'description': 'md5:82be4c0e13eae8ea1ca8b9f2e07226a8',
+            'uploader': 'Marcin Rola - MOIM ZDANIEM!🇵🇱',
+            'channel_id': 'ch_QgWnHvDG2fo5',
+            'channel_url': 'https://banbye.com/channel/ch_QgWnHvDG2fo5',
+            'duration': 597,
+            'timestamp': 1688642656,
+            'upload_date': '20230706',
+            'thumbnail': 'https://cdn.banbye.com/video/v_kb6_o1Kyq-CD/96.webp',
+            'tags': ['Paryż', 'Francja', 'Polska', 'Imigranci', 'Morawiecki', 'Tusk'],
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
     }]
 
     def _real_extract(self, url):

From aee6b9b88c0bcccf27fd23b7e00fc0b7b168928f Mon Sep 17 00:00:00 2001
From: barsnick <barsnick@users.noreply.github.com>
Date: Sat, 16 Sep 2023 12:04:08 +0200
Subject: [PATCH 362/501] [ie/Axs] Add extractor (#8094)

Authored by: barsnick
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/axs.py         | 87 +++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 yt_dlp/extractor/axs.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index b788737a2d..b836fe8a3d 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -165,6 +165,7 @@
     AWAANLiveIE,
     AWAANSeasonIE,
 )
+from .axs import AxsIE
 from .azmedien import AZMedienIE
 from .baidu import BaiduVideoIE
 from .banbye import (
diff --git a/yt_dlp/extractor/axs.py b/yt_dlp/extractor/axs.py
new file mode 100644
index 0000000000..4b263725f1
--- /dev/null
+++ b/yt_dlp/extractor/axs.py
@@ -0,0 +1,87 @@
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    js_to_json,
+    parse_iso8601,
+    traverse_obj,
+    url_or_none,
+)
+
+
+class AxsIE(InfoExtractor):
+    IE_NAME = 'axs.tv'
+    _VALID_URL = r'https?://(?:www\.)?axs\.tv/(?:channel/(?:[^/?#]+/)+)?video/(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        'url': 'https://www.axs.tv/video/5f4dc776b70e4f1c194f22ef/',
+        'md5': '8d97736ae8e50c64df528e5e676778cf',
+        'info_dict': {
+            'id': '5f4dc776b70e4f1c194f22ef',
+            'title': 'Small Town',
+            'ext': 'mp4',
+            'description': 'md5:e314d28bfaa227a4d7ec965fae19997f',
+            'upload_date': '20230602',
+            'timestamp': 1685729564,
+            'duration': 1284.216,
+            'series': 'Rock & Roll Road Trip with Sammy Hagar',
+            'season': 2,
+            'episode': '3',
+            'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394',
+        },
+    }, {
+        'url': 'https://www.axs.tv/channel/rock-star-interview/video/daryl-hall',
+        'md5': '300ae795cd8f9984652c0949734ffbdc',
+        'info_dict': {
+            'id': '5f488148b70e4f392572977c',
+            'display_id': 'daryl-hall',
+            'title': 'Daryl Hall',
+            'ext': 'mp4',
+            'description': 'md5:e54ecaa0f4b5683fc9259e9e4b196628',
+            'upload_date': '20230214',
+            'timestamp': 1676403615,
+            'duration': 2570.668,
+            'series': 'The Big Interview with Dan Rather',
+            'season': 3,
+            'episode': '5',
+            'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        webpage_json_data = self._search_json(
+            r'mountObj\s*=', webpage, 'video ID data', display_id,
+            transform_source=js_to_json)
+        video_id = webpage_json_data['video_id']
+        company_id = webpage_json_data['company_id']
+
+        meta = self._download_json(
+            f'https://api.myspotlight.tv/dotplayer/video/{company_id}/{video_id}',
+            video_id, query={'device_type': 'desktop_web'})['video']
+
+        formats = self._extract_m3u8_formats(
+            meta['video_m3u8'], video_id, 'mp4', m3u8_id='hls')
+
+        subtitles = {}
+        for cc in traverse_obj(meta, ('closeCaption', lambda _, v: url_or_none(v['srtPath']))):
+            subtitles.setdefault(cc.get('srtShortLang') or 'en', []).append(
+                {'ext': cc.get('srtExt'), 'url': cc['srtPath']})
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            **traverse_obj(meta, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'series': ('seriestitle', {str}),
+                'season': ('season', {int}),
+                'episode': ('episode', {str}),
+                'duration': ('duration', {float_or_none}),
+                'timestamp': ('updated_at', {parse_iso8601}),
+                'thumbnail': ('thumb', {url_or_none}),
+            }),
+            'subtitles': subtitles,
+        }

From 6e07e4bc7e59f5bdb60e93c011e57b18b009f2b5 Mon Sep 17 00:00:00 2001
From: zhallgato <zhallgato@gmail.com>
Date: Sat, 16 Sep 2023 12:12:18 +0200
Subject: [PATCH 363/501] [ie/mediaklikk] Fix extractor (#8086)

Fixes https://github.com/yt-dlp/yt-dlp/issues/8053

Authored by: bashonly, zhallgato
---
 yt_dlp/extractor/mediaklikk.py | 72 ++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py
index 46365081b7..fcc4827b5c 100644
--- a/yt_dlp/extractor/mediaklikk.py
+++ b/yt_dlp/extractor/mediaklikk.py
@@ -1,5 +1,8 @@
 from ..utils import (
-    unified_strdate
+    ExtractorError,
+    traverse_obj,
+    unified_strdate,
+    url_or_none,
 )
 from .common import InfoExtractor
 from ..compat import (
@@ -15,7 +18,7 @@ class MediaKlikkIE(InfoExtractor):
                         (?P<id>[^/#?_]+)'''
 
     _TESTS = [{
-        # mediaklikk. date in html.
+        # (old) mediaklikk. date in html.
         'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/',
         'info_dict': {
             'id': '4754129',
@@ -23,9 +26,21 @@ class MediaKlikkIE(InfoExtractor):
             'ext': 'mp4',
             'upload_date': '20210901',
             'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
+        },
+        'skip': 'Webpage redirects to 404 page',
+    }, {
+        # mediaklikk. date in html.
+        'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/',
+        'info_dict': {
+            'id': '6696133',
+            'title': 'Hazajáró, Fabova-hegység - Kishont koronája',
+            'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja',
+            'ext': 'mp4',
+            'upload_date': '20230903',
+            'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
         }
     }, {
-        # m4sport
+        # (old) m4sport
         'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/',
         'info_dict': {
             'id': '4754999',
@@ -33,6 +48,18 @@ class MediaKlikkIE(InfoExtractor):
             'ext': 'mp4',
             'upload_date': '20210830',
             'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg'
+        },
+        'skip': 'Webpage redirects to 404 page',
+    }, {
+        # m4sport
+        'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/',
+        'info_dict': {
+            'id': '6711136',
+            'title': 'Atlétika – Gyémánt Liga, Brüsszel',
+            'display_id': 'atletika-gyemant-liga-brusszel',
+            'ext': 'mp4',
+            'upload_date': '20230908',
+            'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg'
         }
     }, {
         # m4sport with *video/ url and no date
@@ -40,20 +67,33 @@ class MediaKlikkIE(InfoExtractor):
         'info_dict': {
             'id': '4492099',
             'title': 'Real Madrid - Chelsea 1-1',
+            'display_id': 'real-madrid-chelsea-1-1',
             'ext': 'mp4',
-            'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
+            'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
         }
     }, {
-        # hirado
+        # (old) hirado
         'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/',
         'info_dict': {
             'id': '4760120',
             'title': 'Feltételeket szabott a főváros',
             'ext': 'mp4',
             'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg'
+        },
+        'skip': 'Webpage redirects to video list page',
+    }, {
+        # hirado
+        'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
+        'info_dict': {
+            'id': '6716068',
+            'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál',
+            'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
+            'ext': 'mp4',
+            'upload_date': '20230911',
+            'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg'
         }
     }, {
-        # petofilive
+        # (old) petofilive
         'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/',
         'info_dict': {
             'id': '4571948',
@@ -61,6 +101,18 @@ class MediaKlikkIE(InfoExtractor):
             'ext': 'mp4',
             'upload_date': '20210607',
             'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg'
+        },
+        'skip': 'Webpage redirects to empty page',
+    }, {
+        # petofilive
+        'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/',
+        'info_dict': {
+            'id': '6713233',
+            'title': 'Futball Fesztivál a Margitszigeten',
+            'display_id': 'futball-fesztival-a-margitszigeten',
+            'ext': 'mp4',
+            'upload_date': '20230909',
+            'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg'
         }
     }]
 
@@ -84,8 +136,12 @@ def _real_extract(self, url):
             player_data['video'] = player_data.pop('token')
 
         player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
-        playlist_url = self._proto_relative_url(compat_urllib_parse_unquote(
-            self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/'))
+        player_json = self._search_json(
+            r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);')
+        playlist_url = traverse_obj(
+            player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False)
+        if not playlist_url:
+            raise ExtractorError('Unable to extract playlist url')
 
         formats = self._extract_wowza_formats(
             playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash'])

From 98eac0e6ba0e510ae7dfdfd249d42ee71fb272b1 Mon Sep 17 00:00:00 2001
From: hatsomatt <143712404+hatsomatt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:02:37 +0200
Subject: [PATCH 364/501] [ie/videa] Fix extraction (#8003)

Closes #7427

Authored by: hatsomatt, aky-01

Co-authored-by: aky-01 <65510015+aky-01@users.noreply.github.com>
---
 yt_dlp/extractor/videa.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py
index 59ae933b08..634d2edea6 100644
--- a/yt_dlp/extractor/videa.py
+++ b/yt_dlp/extractor/videa.py
@@ -38,6 +38,7 @@ class VideaIE(InfoExtractor):
             'title': 'Az őrült kígyász 285 kígyót enged szabadon',
             'thumbnail': r're:^https?://.*',
             'duration': 21,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
@@ -48,6 +49,7 @@ class VideaIE(InfoExtractor):
             'title': 'Supercars előzés',
             'thumbnail': r're:^https?://.*',
             'duration': 64,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
@@ -58,6 +60,7 @@ class VideaIE(InfoExtractor):
             'title': 'Az őrült kígyász 285 kígyót enged szabadon',
             'thumbnail': r're:^https?://.*',
             'duration': 21,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
@@ -124,7 +127,7 @@ def _real_extract(self, url):
             query['_t'] = result[:16]
 
         b64_info, handle = self._download_webpage_handle(
-            'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
+            'http://videa.hu/player/xml', video_id, query=query)
         if b64_info.startswith('<?xml'):
             info = self._parse_xml(b64_info, video_id)
         else:

From 7d3d658f4c558ee7d72b1c01b46f2126948681cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Duval?= <jerome.duval@gmail.com>
Date: Sat, 16 Sep 2023 16:24:11 +0200
Subject: [PATCH 365/501] [ie/TV5MondePlus] Fix extractor (#7952)

Closes #4978
Authored by: korli, dirkf
---
 yt_dlp/extractor/tv5mondeplus.py | 98 ++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 18 deletions(-)

diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py
index bd0be784d2..4da1b26d1a 100644
--- a/yt_dlp/extractor/tv5mondeplus.py
+++ b/yt_dlp/extractor/tv5mondeplus.py
@@ -1,10 +1,14 @@
+import urllib.parse
+
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     extract_attributes,
     int_or_none,
     parse_duration,
+    traverse_obj,
     try_get,
+    url_or_none,
 )
 
 
@@ -12,6 +16,36 @@ class TV5MondePlusIE(InfoExtractor):
     IE_DESC = 'TV5MONDE+'
     _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
     _TESTS = [{
+        # movie
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices',
+        'md5': 'c86f60bf8b75436455b1b205f9745955',
+        'info_dict': {
+            'id': 'ZX0ipMyFQq_6D4BA7b',
+            'display_id': 'les-novices',
+            'ext': 'mp4',
+            'title': 'Les novices',
+            'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b',
+            'upload_date': '20230821',
+            'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg',
+            'duration': 5177,
+            'episode': 'Les novices',
+        },
+    }, {
+        # series episode
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2',
+        'info_dict': {
+            'id': 'wJ0eeEPozr_6D4BA7b',
+            'display_id': 'opj-les-dents-de-la-terre-2',
+            'ext': 'mp4',
+            'title': "OPJ - Les dents de la Terre (2)",
+            'description': 'md5:288f87fd68d993f814e66e60e5302d9d',
+            'upload_date': '20230823',
+            'series': 'OPJ',
+            'episode': 'Les dents de la Terre (2)',
+            'duration': 2877,
+            'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg'
+        },
+    }, {
         # movie
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent',
         'md5': '32fa0cde16a4480d1251502a66856d5f',
@@ -23,6 +57,7 @@ class TV5MondePlusIE(InfoExtractor):
             'description': 'md5:570e8bb688036ace873b2d50d24c026d',
             'upload_date': '20210819',
         },
+        'skip': 'no longer available',
     }, {
         # series episode
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice',
@@ -39,6 +74,7 @@ class TV5MondePlusIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'no longer available',
     }, {
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
         'only_matching': True,
@@ -63,20 +99,45 @@ def _real_extract(self, url):
         video_files = self._parse_json(
             vpl_data['data-broadcast'], display_id)
         formats = []
-        for video_file in video_files:
-            v_url = video_file.get('url')
-            if not v_url:
-                continue
-            video_format = video_file.get('format') or determine_ext(v_url)
-            if video_format == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    v_url, display_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            else:
-                formats.append({
-                    'url': v_url,
-                    'format_id': video_format,
-                })
+        video_id = None
+
+        def process_video_files(v):
+            nonlocal video_id
+            for video_file in v:
+                v_url = video_file.get('url')
+                if not v_url:
+                    continue
+                if video_file.get('type') == 'application/deferred':
+                    d_param = urllib.parse.quote(v_url)
+                    token = video_file.get('token')
+                    if not token:
+                        continue
+                    deferred_json = self._download_json(
+                        f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id,
+                        note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False)
+                    v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none}))
+                    if not v_url:
+                        continue
+                    # data-guid from the webpage isn't stable, use the material id from the json urls
+                    video_id = self._search_regex(
+                        r'materials/([\da-zA-Z]{10}_[\da-fA-F]{7})/', v_url, 'video id', default=None)
+                    process_video_files(deferred_json)
+
+                video_format = video_file.get('format') or determine_ext(v_url)
+                if video_format == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        v_url, display_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif video_format == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        v_url, display_id, fatal=False))
+                else:
+                    formats.append({
+                        'url': v_url,
+                        'format_id': video_format,
+                    })
+
+        process_video_files(video_files)
 
         metadata = self._parse_json(
             vpl_data['data-metadata'], display_id)
@@ -100,10 +161,11 @@ def _real_extract(self, url):
         if upload_date:
             upload_date = upload_date.replace('_', '')
 
-        video_id = self._search_regex(
-            (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
-             r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
-            default=display_id)
+        if not video_id:
+            video_id = self._search_regex(
+                (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+                 r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
+                default=display_id)
 
         return {
             'id': video_id,

From f659e6439444ac64305b5c80688cd82f59d2279c Mon Sep 17 00:00:00 2001
From: Simon Sawicki <contact@grub4k.xyz>
Date: Sat, 16 Sep 2023 17:50:06 +0200
Subject: [PATCH 366/501] [ie/bpb] Overhaul extractor (#8119)

Authored by: Grub4K
---
 yt_dlp/extractor/bpb.py | 174 +++++++++++++++++++++++++++++++++-------
 yt_dlp/utils/_utils.py  |   1 +
 2 files changed, 145 insertions(+), 30 deletions(-)

diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py
index f28e581b87..7fe0899449 100644
--- a/yt_dlp/extractor/bpb.py
+++ b/yt_dlp/extractor/bpb.py
@@ -1,56 +1,170 @@
+import functools
 import re
 
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
+    extract_attributes,
+    get_element_text_and_html_by_tag,
+    get_elements_by_class,
+    join_nonempty,
     js_to_json,
-    determine_ext,
+    mimetype2ext,
+    unified_strdate,
+    url_or_none,
+    urljoin,
+    variadic,
 )
+from ..utils.traversal import traverse_obj
+
+
+def html_get_element(tag=None, cls=None):
+    assert tag or cls, 'One of tag or class is required'
+
+    if cls:
+        func = functools.partial(get_elements_by_class, cls, tag=tag)
+    else:
+        func = functools.partial(get_element_text_and_html_by_tag, tag)
+
+    def html_get_element_wrapper(html):
+        return variadic(func(html))[0]
+
+    return html_get_element_wrapper
 
 
 class BpbIE(InfoExtractor):
     IE_DESC = 'Bundeszentrale für politische Bildung'
-    _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
+    _VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
-        'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
         'info_dict': {
             'id': '297',
             'ext': 'mp4',
+            'creator': 'Kooperative Berlin',
+            'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
+            'release_date': '20160115',
+            'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
+            'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
+            'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
             'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
-            'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
+            'uploader': 'Bundeszentrale für politische Bildung',
+        },
+    }, {
+        'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/',
+        'info_dict': {
+            'id': '522184',
+            'ext': 'mp4',
+            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
+            'release_date': '20230621',
+            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
+            'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
+            'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
+            'uploader': 'Bundeszentrale für politische Bildung',
+        },
+    }, {
+        'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/',
+        'info_dict': {
+            'id': '518789',
+            'ext': 'mp4',
+            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
+            'release_date': '20230302',
+            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
+            'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
+            'title': 'md5:3e956f264bb501f6383f10495a401da4',
+            'uploader': 'Bundeszentrale für politische Bildung',
+        },
+    }, {
+        'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/',
+        'info_dict': {
+            'id': '315813',
+            'ext': 'mp3',
+            'creator': 'Axel Schröder',
+            'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
+            'release_date': '20200921',
+            'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
+            'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
+            'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
+            'title': 'Folge 1: Eine Einführung',
+            'uploader': 'Bundeszentrale für politische Bildung',
+        },
+    }, {
+        'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/',
+        'info_dict': {
+            'id': '517806',
+            'ext': 'mp3',
+            'creator': 'Bundeszentrale für politische Bildung',
+            'description': 'md5:594689600e919912aade0b2871cc3fed',
+            'release_date': '20230127',
+            'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
+            'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
+            'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
+            'title': 'Die Weltanschauung der "Neuen Rechten"',
+            'uploader': 'Bundeszentrale für politische Bildung',
+        },
+    }, {
+        'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/',
+        'only_matching': True,
+    }]
+
+    _TITLE_RE = re.compile('(?P<title>[^<]*)<[^>]+>(?P<series>[^<]*)')
+
+    def _parse_vue_attributes(self, name, string, video_id):
+        attributes = extract_attributes(self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name))
+
+        for key, value in attributes.items():
+            if key.startswith(':'):
+                attributes[key] = self._parse_json(value, video_id, transform_source=js_to_json, fatal=False)
+
+        return attributes
+
+    @staticmethod
+    def _process_source(source):
+        url = url_or_none(source['src'])
+        if not url:
+            return None
+
+        source_type = source.get('type', '')
+        extension = mimetype2ext(source_type)
+        is_video = source_type.startswith('video')
+        note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None
+
+        return {
+            'url': url,
+            'ext': extension,
+            'vcodec': None if is_video else 'none',
+            'quality': 10 if note == 'high' else 0,
+            'format_note': note,
+            'format_id': join_nonempty(extension, note),
         }
-    }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_regex(
-            r'<h2 class="white">(.*?)</h2>', webpage, 'title')
-        video_info_dicts = re.findall(
-            r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
-
-        formats = []
-        for video_info in video_info_dicts:
-            video_info = self._parse_json(
-                video_info, video_id, transform_source=js_to_json, fatal=False)
-            if not video_info:
-                continue
-            video_url = video_info.get('src')
-            if not video_url:
-                continue
-            quality = 'high' if '_high' in video_url else 'low'
-            formats.append({
-                'url': video_url,
-                'quality': 10 if quality == 'high' else 0,
-                'format_note': quality,
-                'format_id': '%s-%s' % (quality, determine_ext(video_url)),
-            })
+        title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
+        json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
 
         return {
             'id': video_id,
-            'formats': formats,
-            'title': title,
-            'description': self._og_search_description(webpage),
+            'title': traverse_obj(title_result, ('title', {str.strip})) or None,
+            # This metadata could be interpreted otherwise, but it fits "series" the most
+            'series': traverse_obj(title_result, ('series', {str.strip})) or None,
+            'description': join_nonempty(*traverse_obj(webpage, [(
+                {html_get_element(cls='opening-intro')},
+                [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
+            ), {clean_html}]), delim='\n\n') or None,
+            'creator': self._html_search_meta('author', webpage),
+            'uploader': self._html_search_meta('publisher', webpage),
+            'release_date': unified_strdate(self._html_search_meta('date', webpage)),
+            'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
+            **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
+                'formats': (':sources', ..., {self._process_source}),
+                'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
+            }),
         }
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index f5552ce802..180bec245a 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -2847,6 +2847,7 @@ def mimetype2ext(mt, default=NO_DEFAULT):
         'quicktime': 'mov',
         'webm': 'webm',
         'vp9': 'vp9',
+        'video/ogg': 'ogv',
         'x-flv': 'flv',
         'x-m4v': 'm4v',
         'x-matroska': 'mkv',
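For context on the `html_get_element` factory added in the bpb patch above: it curries yt-dlp's HTML utilities into single-argument callables so they can sit inside `traverse_obj` paths. A minimal sketch of its two branches, assuming the helpers behave as shown in the patch; the sample HTML is made up:

    from yt_dlp.utils import (
        get_element_text_and_html_by_tag,
        get_elements_by_class,
        variadic,
    )

    html = '<div class="text-content">Hello</div><span>World</span>'

    # cls branch: get_elements_by_class returns a list of matches,
    # so variadic(...)[0] picks the first element's contents
    print(variadic(get_elements_by_class('text-content', html))[0])  # Hello

    # tag branch: get_element_text_and_html_by_tag returns a (text, html)
    # tuple, so the same [0] yields just the text
    print(variadic(get_element_text_and_html_by_tag('span', html))[0])  # World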
From 069cbece9dba6384f1cc5fcfc7ce562a31af42fc Mon Sep 17 00:00:00 2001
From: bashonly <bashonly@bashonly.com>
Date: Sat, 16 Sep 2023 13:28:14 -0500
Subject: [PATCH 367/501] [ie/tiktok] Fix webpage extraction

Closes #8089
Authored by: bashonly
---
 yt_dlp/extractor/tiktok.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index f14c4f9d6a..f26972cff2 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -15,7 +15,6 @@
     UserNotLive,
     determine_ext,
     format_field,
-    get_element_by_id,
     get_first,
     int_or_none,
     join_nonempty,
@@ -50,8 +49,9 @@ def _create_url(user_id, video_id):
         return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
 
     def _get_sigi_state(self, webpage, display_id):
-        return self._parse_json(get_element_by_id(
-            'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
+        return self._search_json(
+            r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
+            'sigi state', display_id, end_pattern=r'</script>')
 
     def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
                        note='Downloading API JSON', errnote='Unable to download API page'):

From cebbd33b1c678149fc8f0e254db6fc0da317ea80 Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:43:12 -0400
Subject: [PATCH 368/501] [ie/twitcasting] Improve `_VALID_URL` (#8120)

Closes #7597
Authored by: c-basalt
---
 yt_dlp/extractor/twitcasting.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py
index dff353a4f9..3890d5d8fb 100644
--- a/yt_dlp/extractor/twitcasting.py
+++ b/yt_dlp/extractor/twitcasting.py
@@ -22,7 +22,7 @@
 
 
 class TwitCastingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(?:movie|twplayer)/(?P<id>\d+)'
     _M3U8_HEADERS = {
         'Origin': 'https://twitcasting.tv',
         'Referer': 'https://twitcasting.tv/',
@@ -231,7 +231,7 @@ def find_dmu(x):
 
 
 class TwitCastingLiveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/?(?:[#?]|$)'
     _TESTS = [{
         'url': 'https://twitcasting.tv/ivetesangalo',
         'only_matching': True,
@@ -265,8 +265,15 @@ def _real_extract(self, url):
 
 
 class TwitCastingUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(:?show|archive)/?(?:[#?]|$)'
     _TESTS = [{
+        'url': 'https://twitcasting.tv/natsuiromatsuri/archive/',
+        'info_dict': {
+            'id': 'natsuiromatsuri',
+            'title': 'natsuiromatsuri - Live History',
+        },
+        'playlist_mincount': 235,
+    }, {
         'url': 'https://twitcasting.tv/noriyukicas/show',
         'only_matching': True,
     }]

From 9bf14be775289bd88cc1f5c89fd761ae51879484 Mon Sep 17 00:00:00 2001
From: makeworld <25111343+makew0rld@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:49:43 -0400
Subject: [PATCH 369/501] [ie/cbc] Ignore any 426 from API (#7689)

Closes #7477

Authored by: makew0rld
---
 yt_dlp/extractor/cbc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index b3c5471f7b..2920b9027d 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -339,12 +339,12 @@ def _new_claims_token(self, email, password):
         data = json.dumps({'jwt': sig}).encode()
         headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
         resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
-                                   None, data=data, headers=headers)
+                                   None, data=data, headers=headers, expected_status=426)
         cbc_access_token = resp['accessToken']
 
         headers = {'content-type': 'application/json', 'ott-device-type': 'web',
                    'ott-access-token': cbc_access_token}
         resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
-                                   None, headers=headers)
+                                   None, headers=headers, expected_status=426)
         return resp['claimsToken']
 
     def _get_claims_token_expiry(self):

From 5336bf57a7061e0955a37f0542fc8ebf50d55b17 Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:53:57 -0400
Subject: [PATCH 370/501] [ie/bilibili] Extract `format_id` (#7555)

Authored by: c-basalt
---
 yt_dlp/extractor/bilibili.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index cb7ab2a174..290340078c 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -3,6 +3,7 @@
 import hashlib
 import itertools
 import math
+import re
 import time
 import urllib.parse
 
@@ -38,6 +39,8 @@
 
 
 class BilibiliBaseIE(InfoExtractor):
+    _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
+
     def extract_formats(self, play_info):
         format_names = {
             r['quality']: traverse_obj(r, 'new_description', 'display_desc')
@@ -54,7 +57,8 @@ def extract_formats(self, play_info):
             'acodec': audio.get('codecs'),
             'vcodec': 'none',
             'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
-            'filesize': int_or_none(audio.get('size'))
+            'filesize': int_or_none(audio.get('size')),
+            'format_id': str_or_none(audio.get('id')),
         } for audio in audios]
 
         formats.extend({
@@ -68,6 +72,9 @@ def extract_formats(self, play_info):
             'tbr': float_or_none(video.get('bandwidth'), scale=1000),
             'filesize': int_or_none(video.get('size')),
             'quality': int_or_none(video.get('id')),
+            'format_id': traverse_obj(
+                video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1),
+                ('id', {str_or_none}), get_all=False),
             'format': format_names.get(video.get('id')),
         } for video in traverse_obj(play_info, ('dash', 'video', ...)))
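A quick illustration of the `_FORMAT_ID_RE` path added in the bilibili patch above, which falls back to the numeric id embedded in the DASH segment URL when the JSON carries no usable `id`; the URL below is fabricated for the example:

    import re

    from yt_dlp.utils import traverse_obj

    _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
    video = {'base_url': 'https://example.bilivideo.com/upgcxcode/live-100047.m4s?e=abc'}

    # branch over both key spellings, run the regex, then take capture group 1
    print(traverse_obj(
        video, (('baseUrl', 'base_url'), {_FORMAT_ID_RE.search}, 1), get_all=False))  # 100047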
From 9d376c4daeaf1279a011582f3f0e6ae42af520dd Mon Sep 17 00:00:00 2001
From: Aniruddh Joshi <aniruddh@ebincoweb.com>
Date: Sun, 17 Sep 2023 02:28:21 +0530
Subject: [PATCH 371/501] [ie/AmazonMiniTV] Fix extractor (#8103)

Closes #7817

Authored by: Aniruddh-J
---
 yt_dlp/extractor/amazonminitv.py | 63 +++++---------------------------
 1 file changed, 9 insertions(+), 54 deletions(-)

diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py
index b57d985d10..ad23b16bd6 100644
--- a/yt_dlp/extractor/amazonminitv.py
+++ b/yt_dlp/extractor/amazonminitv.py
@@ -37,7 +37,7 @@ def _call_api(self, asin, data=None, note=None):
         return resp['data'][data['operationName']]
 
 
-class AmazonMiniTVIE(AmazonMiniTVBaseIE):
+class AmazonMiniTVIE(InfoExtractor):
     _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)'
     _TESTS = [{
         'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv',
@@ -86,56 +86,14 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE):
         'only_matching': True,
     }]
 
-    _GRAPHQL_QUERY_CONTENT = '''
-query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) {
-  content(
-    applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId}
-    contentId: $contentId
-    contentType: $contentType
-  ) {
-    contentId
-    name
-    ... on Episode {
-      contentId
-      vodType
-      name
-      images
-      description {
-        synopsis
-        contentLengthInSeconds
-      }
-      publicReleaseDateUTC
-      audioTracks
-      seasonId
-      seriesId
-      seriesName
-      seasonNumber
-      episodeNumber
-      timecode {
-        endCreditsTime
-      }
-    }
-    ... on MovieContent {
-      contentId
-      vodType
-      name
-      description {
-        synopsis
-        contentLengthInSeconds
-      }
-      images
-      publicReleaseDateUTC
-      audioTracks
-    }
-  }
-}'''
-
     def _real_extract(self, url):
-        asin = f'amzn1.dv.gti.{self._match_id(url)}'
-        prs = self._call_api(asin, note='Downloading playback info')
+        video_uuid = self._match_id(url)
+        asin = f'amzn1.dv.gti.{video_uuid}'
+        webpage = self._download_webpage(f'https://www.amazon.in/minitv/tp/{video_uuid}', asin)
+        data = self._search_nextjs_data(webpage, asin)['props']['pageProps']['ssrProps']
 
         formats, subtitles = [], {}
-        for type_, asset in prs['playbackAssets'].items():
+        for type_, asset in traverse_obj(data, ('playbackData', 'playbackAssets', {dict.items}, ...)):
             if not traverse_obj(asset, 'manifestUrl'):
                 continue
             if type_ == 'hls':
@@ -152,12 +110,7 @@ def _real_extract(self, url):
             else:
                 self.report_warning(f'Unknown asset type: {type_}')
 
-        title_info = self._call_api(
-            asin, note='Downloading title info', data={
-                'operationName': 'content',
-                'variables': {'contentId': asin},
-                'query': self._GRAPHQL_QUERY_CONTENT,
-            })
+        title_info = traverse_obj(data, ('contentData', {dict})) or {}
         credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000)
         is_episode = title_info.get('vodType') == 'EPISODE'
 
@@ -192,6 +145,7 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE):
     IE_NAME = 'amazonminitv:season'
     _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
     IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix'
+    _WORKING = False
     _TESTS = [{
         'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0',
         'playlist_mincount': 6,
@@ -251,6 +205,7 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE):
     IE_NAME = 'amazonminitv:series'
     _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
     IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix'
+    _WORKING = False
     _TESTS = [{
         'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
         'playlist_mincount': 3,

From a83da3717d30697102e76f63a6f29d77f9373c2a Mon Sep 17 00:00:00 2001
From: ApoorvShah111 <79164543+ApoorvShah111@users.noreply.github.com>
Date: Sun, 17 Sep 2023 02:31:26 +0530
Subject: [PATCH 372/501] [ie/nitter] Fix title extraction fallback (#8102)

Closes #7575
Authored by: ApoorvShah111
---
 yt_dlp/extractor/nitter.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py
index 5d1ca1f5d0..35d1311dcd 100644
--- a/yt_dlp/extractor/nitter.py
+++ b/yt_dlp/extractor/nitter.py
@@ -265,6 +265,26 @@ class NitterIE(InfoExtractor):
                 'repost_count': int,
                 'comment_count': int,
             }
+        }, {  # no OpenGraph title
+            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
+            'info_dict': {
+                'id': '1678455464038735895',
+                'ext': 'mp4',
+                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
+                'description': 'Local man, what did Romanians ever do to you?',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'uploader': 'Your Typical Local Man',
+                'uploader_id': 'LocalBateman',
+                'uploader_url': f'https://{current_instance}/LocalBateman',
+                'upload_date': '20230710',
+                'timestamp': 1689009900,
+                'view_count': int,
+                'like_count': int,
+                'repost_count': int,
+                'comment_count': int,
+            },
+            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
+            'params': {'skip_download': 'm3u8'},
         }
     ]
 
@@ -292,7 +312,7 @@ def _real_extract(self, url):
                 'ext': ext
             }]
 
-        title = description = self._og_search_description(full_webpage) or self._html_search_regex(
+        title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
             r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
 
         uploader_id = self._html_search_regex(

From ecef42c3adbcb6a84405139047923c4967316f28 Mon Sep 17 00:00:00 2001
From: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
Date: Sun, 17 Sep 2023 05:04:10 +0800
Subject: [PATCH 373/501] [ie/zaiko] Improve thumbnail extraction (#8054)

Authored by: pzhlkj6612
---
 yt_dlp/extractor/zaiko.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py
index 0ccacbb6aa..2b6221da21 100644
--- a/yt_dlp/extractor/zaiko.py
+++ b/yt_dlp/extractor/zaiko.py
@@ -9,6 +9,7 @@
     traverse_obj,
     try_call,
     unescapeHTML,
+    url_basename,
     url_or_none,
 )
 
@@ -45,12 +46,14 @@ class ZaikoIE(ZaikoBaseIE):
             'uploader_id': '454',
             'uploader': 'ZAIKO ZERO',
             'release_timestamp': 1583809200,
-            'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+',
+            'thumbnail': r're:^https://[\w.-]+/\w+/\w+',
+            'thumbnails': 'maxcount:2',
             'release_date': '20200310',
             'categories': ['Tech House'],
             'live_status': 'was_live',
         },
         'params': {'skip_download': 'm3u8'},
+        'skip': 'Your account does not have tickets to this event',
     }]
 
     def _real_extract(self, url):
@@ -83,6 +86,12 @@ def _real_extract(self, url):
         if not formats:
             self.raise_no_formats(msg, expected=expected)
 
+        thumbnail_urls = [
+            traverse_obj(player_meta, ('initial_event_info', 'poster_url')),
+            self._og_search_thumbnail(self._download_webpage(
+                f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''),
+        ]
+
         return {
             'id': video_id,
             'formats': formats,
@@ -96,8 +105,8 @@ def _real_extract(self, url):
             }),
             **traverse_obj(player_meta, ('initial_event_info', {
                 'alt_title': ('title', {str}),
-                'thumbnail': ('poster_url', {url_or_none}),
             })),
+            'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)]
         }

From 0ce1f48bf1cb78d40d734ce73ee1c90eccf92274 Mon Sep 17 00:00:00 2001
From: 04-pasha-04 <89145825+04-pasha-04@users.noreply.github.com>
Date: Sat, 16 Sep 2023 23:06:00 +0200
Subject: [PATCH 374/501] [ie/funker530] Fix extraction (#8040)

Authored by: 04-pasha-04
---
 yt_dlp/extractor/funker530.py | 1 +
 yt_dlp/extractor/rumble.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py
index ba5ab7d4ee..62fd7f6dda 100644
--- a/yt_dlp/extractor/funker530.py
+++ b/yt_dlp/extractor/funker530.py
@@ -60,6 +60,7 @@ class Funker530IE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
+        info = {}
         rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage))
         if rumble_url:
             info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()}
diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py
index f8bf4a1825..96c192581d 100644
--- a/yt_dlp/extractor/rumble.py
+++ b/yt_dlp/extractor/rumble.py
@@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage):
         if embeds:
             return embeds
         return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
-            r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
+            r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{[^}]*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)

From 23d829a3420450bcfb0788e6fb2cf4f6acdbe596 Mon Sep 17 00:00:00 2001
From: Tristan Lee <lee.tristan.evans@gmail.com>
Date: Sat, 16 Sep 2023 16:08:15 -0500
Subject: [PATCH 375/501] [ie/Rumble] Fix embed extraction (#8035)

Authored by: trislee
---
 yt_dlp/extractor/rumble.py | 59 ++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py
index 96c192581d..85567d9a22 100644
--- a/yt_dlp/extractor/rumble.py
+++ b/yt_dlp/extractor/rumble.py
@@ -33,7 +33,7 @@ class RumbleEmbedIE(InfoExtractor):
             'upload_date': '20191020',
             'channel_url': 'https://rumble.com/c/WMAR',
             'channel': 'WMAR',
-            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
+            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg',
             'duration': 234,
             'uploader': 'WMAR',
             'live_status': 'not_live',
@@ -84,7 +84,7 @@ class RumbleEmbedIE(InfoExtractor):
         'info_dict': {
             'id': 'v1essrt',
             'ext': 'mp4',
-            'title': 'startswith:lofi hip hop radio - beats to relax/study',
+            'title': 'startswith:lofi hip hop radio 📚 - beats to relax/study to',
             'timestamp': 1661519399,
             'upload_date': '20220826',
             'channel_url': 'https://rumble.com/c/LofiGirl',
@@ -99,7 +99,7 @@ class RumbleEmbedIE(InfoExtractor):
         'url': 'https://rumble.com/embed/v1amumr',
         'info_dict': {
             'id': 'v1amumr',
-            'ext': 'webm',
+            'ext': 'mp4',
             'fps': 60,
             'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live',
             'timestamp': 1658518457,
@@ -129,7 +129,7 @@ class RumbleEmbedIE(InfoExtractor):
             'duration': 92,
             'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
             'channel_url': 'https://rumble.com/c/RichSementa',
-            'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
+            'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.qR4e-small-911-Audio-From-The-Man-Who-.jpg',
             'timestamp': 1654892716,
             'uploader': 'Mr Producer Media',
             'upload_date': '20220610',
@@ -236,7 +236,9 @@ def _real_extract(self, url):
 
 class RumbleIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$'
-    _EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>']
+    _EMBED_REGEX = [
+        r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>',
+        r'<a[^>]+class="videostream__link link"[^>]+href=(?P<url>/v[\w.-]+\.html)[^>]*>']
     _TESTS = [{
         'add_ie': ['RumbleEmbed'],
         'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
@@ -254,6 +256,7 @@ class RumbleIE(InfoExtractor):
             'thumbnail': r're:https://.+\.jpg',
             'duration': 103,
             'like_count': int,
+            'dislike_count': int,
             'view_count': int,
             'live_status': 'not_live',
         }
@@ -278,6 +281,9 @@ class RumbleIE(InfoExtractor):
             'channel_url': 'https://rumble.com/c/Redacted',
             'live_status': 'not_live',
             'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
         },
     }, {
         'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html',
@@ -296,12 +302,15 @@ class RumbleIE(InfoExtractor):
             'channel_url': 'https://rumble.com/c/KimIversen',
             'channel': 'Kim Iversen',
             'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
         },
     }]
 
     _WEBPAGE_TESTS = [{
         'url': 'https://rumble.com/videos?page=2',
-        'playlist_count': 25,
+        'playlist_mincount': 24,
         'info_dict': {
             'id': 'videos?page=2',
             'title': 'All videos',
@@ -309,17 +318,16 @@ class RumbleIE(InfoExtractor):
             'age_limit': 0,
         },
     }, {
-        'url': 'https://rumble.com/live-videos',
-        'playlist_mincount': 19,
+        'url': 'https://rumble.com/browse/live',
+        'playlist_mincount': 25,
         'info_dict': {
-            'id': 'live-videos',
-            'title': 'Live Videos',
-            'description': 'Live videos on Rumble.com',
+            'id': 'live',
+            'title': 'Browse',
             'age_limit': 0,
         },
     }, {
         'url': 'https://rumble.com/search/video?q=rumble&sort=views',
-        'playlist_count': 24,
+        'playlist_mincount': 24,
         'info_dict': {
             'id': 'video?q=rumble&sort=views',
             'title': 'Search results for: rumble',
@@ -334,19 +342,20 @@ def _real_extract(self, url):
         if not url_info:
             raise UnsupportedError(url)
 
-        release_ts_str = self._search_regex(
-            r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)',
-            webpage, 'release date', fatal=False, default=None)
-        view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views',
-                                            webpage, 'view count', fatal=False, default=None)
-
-        return self.url_result(
-            url_info['url'], ie_key=url_info['ie_key'], url_transparent=True,
-            view_count=parse_count(view_count_str),
-            release_timestamp=parse_iso8601(release_ts_str),
-            like_count=parse_count(get_element_by_class('rumbles-count', webpage)),
-            description=clean_html(get_element_by_class('media-description', webpage)),
-        )
+        return {
+            '_type': 'url_transparent',
+            'ie_key': url_info['ie_key'],
+            'url': url_info['url'],
+            'release_timestamp': parse_iso8601(self._search_regex(
+                r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', webpage, 'release date', default=None)),
+            'view_count': int_or_none(self._search_regex(
+                r'"userInteractionCount"\s*:\s*(\d+)', webpage, 'view count', default=None)),
+            'like_count': parse_count(self._search_regex(
+                r'<span data-js="rumbles_up_votes">\s*([\d,.KM]+)', webpage, 'like count', default=None)),
+            'dislike_count': parse_count(self._search_regex(
+                r'<span data-js="rumbles_down_votes">\s*([\d,.KM]+)', webpage, 'dislike count', default=None)),
+            'description': clean_html(get_element_by_class('media-description', webpage))
+        }
 
 
 class RumbleChannelIE(InfoExtractor):

From b4c1c408c63724339eb12b16c91b253a7ee62cfa Mon Sep 17 00:00:00 2001
From: barsnick <barsnick@users.noreply.github.com>
Date: Sat, 16 Sep 2023 23:11:05 +0200
Subject: [PATCH 376/501] [ie/Bild.de] Extract HLS formats (#8032)

Closes #7951
Authored by: barsnick
---
 yt_dlp/extractor/bild.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/bild.py b/yt_dlp/extractor/bild.py
index f3dea33c46..eb289329d8 100644
--- a/yt_dlp/extractor/bild.py
+++ b/yt_dlp/extractor/bild.py
@@ -1,6 +1,7 @@
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    traverse_obj,
     unescapeHTML,
 )
 
@@ -8,7 +9,8 @@
 class BildIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
     IE_DESC = 'Bild.de'
-    _TEST = {
+    _TESTS = [{
+        'note': 'static MP4 only',
         'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
         'md5': 'dd495cbd99f2413502a1713a1156ac8a',
         'info_dict': {
@@ -19,7 +21,19 @@ class BildIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 196,
         }
-    }
+    }, {
+        'note': 'static MP4 and HLS',
+        'url': 'https://www.bild.de/video/clip/news-ausland/deftiger-abgang-vom-10m-turm-bademeister-sorgt-fuer-skandal-85158620.bild.html',
+        'md5': 'fb0ed4f09c495d4ba7ce2eee0bb90de1',
+        'info_dict': {
+            'id': '85158620',
+            'ext': 'mp4',
+            'title': 'Der Sprungturm-Skandal',
+            'description': 'md5:709b543c24dc31bbbffee73bccda34ad',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 69,
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -27,11 +41,23 @@ def _real_extract(self, url):
         video_data = self._download_json(
             url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
 
+        formats = []
+        for src in traverse_obj(video_data, ('clipList', 0, 'srces', lambda _, v: v['src'])):
+            src_type = src.get('type')
+            if src_type == 'application/x-mpegURL':
+                formats.extend(
+                    self._extract_m3u8_formats(
+                        src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+            elif src_type == 'video/mp4':
+                formats.append({'url': src['src'], 'format_id': 'http-mp4'})
+            else:
+                self.report_warning(f'Skipping unsupported format type: "{src_type}"')
+
         return {
             'id': video_id,
             'title': unescapeHTML(video_data['title']).strip(),
             'description': unescapeHTML(video_data.get('description')),
-            'url': video_data['clipList'][0]['srces'][0]['src'],
+            'formats': formats,
             'thumbnail': video_data.get('poster'),
             'duration': int_or_none(video_data.get('durationSec')),
         }

From 5be7e978867b5f66ad6786c674d79d40e950ae16 Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 17:13:04 -0400
Subject: [PATCH 377/501] [ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463

Authored by: c-basalt, bashonly
---
 yt_dlp/extractor/_extractors.py |   5 +-
 yt_dlp/extractor/sohu.py        | 107 ++++++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 7 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index b836fe8a3d..4fed6d66a2 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1795,7 +1795,10 @@
 from .slutload import SlutloadIE
 from .smotrim import SmotrimIE
 from .snotr import SnotrIE
-from .sohu import SohuIE
+from .sohu import (
+    SohuIE,
+    SohuVIE,
+)
 from .sonyliv import (
     SonyLIVIE,
     SonyLIVSeriesIE,
diff --git a/yt_dlp/extractor/sohu.py b/yt_dlp/extractor/sohu.py
index a8f1e4623e..c0ff4f9aa8 100644
--- a/yt_dlp/extractor/sohu.py
+++ b/yt_dlp/extractor/sohu.py
@@ -1,3 +1,4 @@
+import base64
 import re
 
 from .common import InfoExtractor
@@ -8,7 +9,12 @@
 from ..utils import (
     ExtractorError,
     int_or_none,
+    float_or_none,
+    url_or_none,
+    unified_timestamp,
try_get, + urljoin, + traverse_obj, ) @@ -31,13 +37,20 @@ class SohuIE(InfoExtractor): 'id': '409385080', 'ext': 'mp4', 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', - } + }, + 'skip': 'no longer available', }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 'info_dict': { 'id': '78693464', 'ext': 'mp4', 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + 'uploader': '爱范儿视频', + 'duration': 213, + 'timestamp': 1425519600, + 'upload_date': '20150305', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg', + 'tags': ['爱范儿', '爱范品', 'MWC', '手机'], } }, { 'note': 'Multipart video', @@ -45,6 +58,12 @@ class SohuIE(InfoExtractor): 'info_dict': { 'id': '78910339', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + 'uploader': '小苍cany', + 'duration': 744.0, + 'timestamp': 1426269360, + 'upload_date': '20150313', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg', + 'tags': ['小苍MM', '英雄联盟', '实战秘籍'], }, 'playlist': [{ 'info_dict': { @@ -75,6 +94,11 @@ class SohuIE(InfoExtractor): 'id': '78932792', 'ext': 'mp4', 'title': 'youtube-dl testing video', + 'duration': 360, + 'timestamp': 1426348620, + 'upload_date': '20150314', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg', + 'tags': [], }, 'params': { 'skip_download': True @@ -100,7 +124,7 @@ def _fetch_data(vid_id, mytv=False): webpage = self._download_webpage(url, video_id) - title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) + title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage)) vid = self._html_search_regex( r'var vid ?= ?["\'](\d+)["\']', @@ -132,7 +156,9 @@ def _fetch_data(vid_id, mytv=False): allot = format_data['allot'] data = format_data['data'] - clips_url = data['clipsURL'] + clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False) + if not clip_url: + raise ExtractorError(f'Unable to extract url for clip {i}') su = data['su'] video_url = 'newflv.sohu.ccgslb.net' @@ -142,9 +168,9 @@ def _fetch_data(vid_id, mytv=False): while 'newflv.sohu.ccgslb.net' in video_url: params = { 'prot': 9, - 'file': clips_url[i], + 'file': clip_url, 'new': su[i], - 'prod': 'flash', + 'prod': 'h5n', 'rb': 1, } @@ -193,6 +219,75 @@ def _fetch_data(vid_id, mytv=False): 'entries': playlist, 'id': video_id, 'title': title, + 'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})), } - return info + if mytv: + publish_time = unified_timestamp(self._search_regex( + r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False)) + else: + publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp})) + + return { + 'timestamp': publish_time - 8 * 3600 if publish_time else None, + **traverse_obj(vid_data, { + 'alt_title': ('data', 'subName', {str}), + 'uploader': ('wm_data', 'wm_username', {str}), + 'thumbnail': ('data', 'coverImg', {url_or_none}), + 'tags': ('data', 'tag', {str.split}), + }), + **info, + } + + +class SohuVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])' + + _TESTS = [{ + 'note': 'Multipart video', + 'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html', + 'info_dict': { + 'id': '601315192', + 'title': '《淬火丹心》第1集', + 'alt_title': '“点天灯”发生事故', + 'duration': 2701.692, + 'timestamp': 1686758040, + 'upload_date': '20230614', + 
'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg', + }, + 'playlist_mincount': 9, + 'skip': 'Only available in China', + }, { + 'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + 'uploader': '爱范儿视频', + 'duration': 213, + 'timestamp': 1425519600, + 'upload_date': '20150305', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg', + 'tags': ['爱范儿', '爱范品', 'MWC', '手机'], + } + }, { + 'note': 'Multipart video', + 'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + 'uploader': '小苍cany', + 'duration': 744.0, + 'timestamp': 1426269360, + 'upload_date': '20150313', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg', + 'tags': ['小苍MM', '英雄联盟', '实战秘籍'], + }, + 'playlist_mincount': 3, + }] + + def _real_extract(self, url): + encoded_id = self._match_id(url) + path = base64.urlsafe_b64decode(encoded_id).decode() + subdomain = 'tv' if re.match(r'\d+/n\d+\.shtml', path) else 'my.tv' + return self.url_result(urljoin(f'http://{subdomain}.sohu.com/', path), SohuIE) From 308936619c8a4f3a52d73c829c2006ff6c55fea2 Mon Sep 17 00:00:00 2001 From: fireattack <human.peng@gmail.com> Date: Sun, 17 Sep 2023 05:18:04 +0800 Subject: [PATCH 378/501] [ie/facebook] Improve format sorting (#8074) Authored by: fireattack --- yt_dlp/extractor/facebook.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index c30a6b06a0..50a750d3b1 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -505,7 +505,6 @@ def process_formats(info): # with non-browser User-Agent. 
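            # Note on the sorting change: the explicit '_format_sort_fields'
            # override removed below is superseded by per-format 'quality'
            # offsets. A minimal sketch of that scheme (the -3 offset is the
            # value used in the hunks below; the rest is illustrative):
            #
            #     # sd/hd formats lack resolution info, so push them below DASH
            #     quality = q(format_id) - 3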
for f in info['formats']: f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - info['_format_sort_fields'] = ('res', 'quality') def extract_relay_data(_filter): return self._parse_json(self._search_regex( @@ -552,7 +551,8 @@ def parse_graphql_video(video): else: formats.append({ 'format_id': format_id, - 'quality': q(format_id), + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, 'url': playable_url, }) extract_dash_manifest(video, formats) @@ -719,9 +719,11 @@ def parse_attachment(attachment, key='media'): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: - preference = -10 if format_id == 'progressive' else -1 + # sd, hd formats w/o resolution info should be deprioritized below DASH + # TODO: investigate if progressive or src formats still exist + preference = -10 if format_id == 'progressive' else -3 if quality == 'hd': - preference += 5 + preference += 1 formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, From 53675852195d8dd859555d4789944a6887171ff8 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:20:34 -0500 Subject: [PATCH 379/501] [ie/generic] Fix KVS thumbnail extraction Closes #8045 Authored by: bashonly --- yt_dlp/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f5c59a0930..33e71d1c57 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2370,7 +2370,7 @@ def _extract_kvs(self, url, webpage, video_id): 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': urljoin(url, thumbnail), 'formats': formats, } From 635ae31f68a3ac7f6393d59657ed711e34ee3552 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:22:21 -0500 Subject: [PATCH 380/501] [ie/mediastream] Make embed extraction non-fatal Authored by: bashonly --- yt_dlp/extractor/mediastream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index cef769f299..d5c9aab8a3 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -14,7 +14,7 @@ class MediaStreamBaseIE(InfoExtractor): _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' def _extract_mediastream_urls(self, webpage): - yield from traverse_obj(list(self._yield_json_ld(webpage, None)), ( + yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), ( lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) From 20c3c9b433dd47faf0dbde6b46e4e34eb76109a5 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:23:54 -0500 Subject: [PATCH 381/501] [ie/reddit] Extract subtitles Closes #7814 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 813e62874c..62f669f35d 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -319,16 +319,20 @@ def add_thumbnail(src): 'format_id': 'fallback', 'format_note': 'DASH video, mp4_dash', }] - formats.extend(self._extract_m3u8_formats( - hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) - 
formats.extend(self._extract_mpd_formats( - dash_playlist_url, display_id, mpd_id='dash', fatal=False)) + hls_fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + dash_playlist_url, display_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=subtitles) return { **info, 'id': video_id, 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, 'duration': int_or_none(reddit_video.get('duration')), } From eda0e415d26eb084e570cf5372d38ee1f616b70f Mon Sep 17 00:00:00 2001 From: garret <garret1317@yandex.com> Date: Sat, 16 Sep 2023 23:47:49 +0100 Subject: [PATCH 382/501] [ie/bbc] Extract tracklist as chapters (#7788) Authored by: garret1317 --- yt_dlp/extractor/bbc.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a55cdef2b8..d1d6e04faa 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -15,11 +15,13 @@ float_or_none, get_element_by_class, int_or_none, + join_nonempty, js_to_json, parse_duration, parse_iso8601, parse_qs, strip_or_none, + traverse_obj, try_get, unescapeHTML, unified_timestamp, @@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| - sounds/play/| events/[^/]+/play/[^/]+/ ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) @@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - }, { - 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', - 'note': 'Audio', - 'info_dict': { - 'id': 'm0007jz9', - 'ext': 'mp4', - 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', - 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", - 'duration': 9840, - }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -844,6 +831,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'upload_date': '20190604', 'categories': ['Psychology'], }, + }, { + # BBC Sounds + 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', + 'info_dict': { + 'id': 'm001q789', + 'ext': 'mp4', + 'title': 'The Night Tracks Mix - Music for the darkling hour', + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', + 'chapters': 'count:8', + 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', + 'uploader': 'Radio 3', + 'duration': 1800, + 'uploader_id': 'bbc_radio_three', + }, }, { # onion routes 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1128,6 +1129,13 @@ def _real_extract(self, url): 'uploader_id': network.get('id'), 'formats': formats, 'subtitles': subtitles, + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + })) or None, } bbc3_config = self._parse_json( From 2da7bcca16fdb40d4bdb2746643ba1a603771382 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 18:57:14 -0500 Subject: [PATCH 383/501] Revert 9d376c4daeaf1279a011582f3f0e6ae42af520dd Authored by: bashonly --- yt_dlp/extractor/amazonminitv.py | 63 +++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index ad23b16bd6..b57d985d10 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -37,7 +37,7 @@ def _call_api(self, asin, data=None, note=None): return resp['data'][data['operationName']] -class AmazonMiniTVIE(InfoExtractor): +class AmazonMiniTVIE(AmazonMiniTVBaseIE): _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)' _TESTS = [{ 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', @@ -86,14 +86,56 @@ class AmazonMiniTVIE(InfoExtractor): 'only_matching': True, }] + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... 
on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}''' + def _real_extract(self, url): - video_uuid = self._match_id(url) - asin = f'amzn1.dv.gti.{video_uuid}' - webpage = self._download_webpage(f'https://www.amazon.in/minitv/tp/{video_uuid}', asin) - data = self._search_nextjs_data(webpage, asin)['props']['pageProps']['ssrProps'] + asin = f'amzn1.dv.gti.{self._match_id(url)}' + prs = self._call_api(asin, note='Downloading playback info') formats, subtitles = [], {} - for type_, asset in traverse_obj(data, ('playbackData', 'playbackAssets', {dict.items}, ...)): + for type_, asset in prs['playbackAssets'].items(): if not traverse_obj(asset, 'manifestUrl'): continue if type_ == 'hls': @@ -110,7 +152,12 @@ def _real_extract(self, url): else: self.report_warning(f'Unknown asset type: {type_}') - title_info = traverse_obj(data, ('contentData', {dict})) or {} + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) is_episode = title_info.get('vodType') == 'EPISODE' @@ -145,7 +192,6 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix' - _WORKING = False _TESTS = [{ 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'playlist_mincount': 6, @@ -205,7 +251,6 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix' - _WORKING = False _TESTS = [{ 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'playlist_mincount': 3, From 538d37671a17e0782d17f08df17800e2e3bd57c8 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 19:03:30 -0500 Subject: [PATCH 384/501] [ie/AmazonMiniTV] Fix extractors Closes #7817 Authored by: GautamMKGarg, bashonly Co-authored by: GautamMKGarg <GautamMKgarg@gmail.com> --- yt_dlp/extractor/amazonminitv.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index b57d985d10..2c71c5ef56 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -22,8 +22,11 @@ def _call_api(self, asin, data=None, note=None): resp = self._download_json( f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', - asin, note=note, headers={'Content-Type': 'application/json'}, - data=json.dumps(data).encode() if data else None, + asin, note=note, headers={ + 'Content-Type': 'application/json', + 'currentpageurl': '/', + 'currentplatform': 'dWeb' + }, data=json.dumps(data).encode() if data else None, query=None if data else { 'deviceType': 'A1WMMUXPCUJL4N', 'contentId': asin, @@ -46,7 +49,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE): 'ext': 'mp4', 'title': 'May I Kiss You?', 'language': 'Hindi', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'description': 'md5:a549bfc747973e04feb707833474e59d', 'release_timestamp': 1644710400, 'release_date': '20220213', @@ -68,7 +71,7 @@ class 
AmazonMiniTVIE(AmazonMiniTVBaseIE): 'ext': 'mp4', 'title': 'Jahaan', 'language': 'Hindi', - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'description': 'md5:05eb765a77bf703f322f120ec6867339', 'release_timestamp': 1647475200, 'release_date': '20220317', From 9652bca1bd02f6bc1b8cb1e186f2ccbf32225561 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Sep 2023 19:38:09 -0500 Subject: [PATCH 385/501] [ie/web.archive:vlive] Remove extractor (#8132) Closes #8122 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/archiveorg.py | 235 -------------------------------- yt_dlp/extractor/naver.py | 2 +- 3 files changed, 1 insertion(+), 237 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4fed6d66a2..bf0c67542e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -122,7 +122,6 @@ from .archiveorg import ( ArchiveOrgIE, YoutubeWebArchiveIE, - VLiveWebArchiveIE, ) from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 2541cd6fd8..a0b26ac5a0 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -3,7 +3,6 @@ import urllib.parse from .common import InfoExtractor -from .naver import NaverBaseIE from .youtube import YoutubeBaseInfoExtractor, YoutubeIE from ..compat import compat_urllib_parse_unquote from ..networking import HEADRequest @@ -947,237 +946,3 @@ def _real_extract(self, url): if not info.get('title'): info['title'] = video_id return info - - -class VLiveWebArchiveIE(InfoExtractor): - IE_NAME = 'web.archive:vlive' - IE_DESC = 'web.archive.org saved vlive videos' - _VALID_URL = r'''(?x) - (?:https?://)?web\.archive\.org/ - (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? 
# /web and the version index is optional - (?:https?(?::|%3[Aa])//)?(?: - (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL - ) - ''' - _TESTS = [{ - 'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - 'uploader_url': None, - 'uploader': None, - 'upload_date': '20150817', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1439816449, - 'like_count': int, - 'channel': 'Girl\'s Day', - 'channel_id': 'FDF27', - 'comment_count': int, - 'release_timestamp': 1439818140, - 'release_date': '20150817', - 'duration': 1014, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937', - 'info_dict': { - 'id': '16937', - 'ext': 'mp4', - 'title': '첸백시 걍방', - 'creator': 'EXO', - 'view_count': int, - 'subtitles': 'mincount:12', - 'uploader_id': 'muploader_j', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20161112', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1478923074, - 'like_count': int, - 'channel': 'EXO', - 'channel_id': 'F94BD', - 'comment_count': int, - 'release_timestamp': 1478924280, - 'release_date': '20161112', - 'duration': 906, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870', - 'info_dict': { - 'id': '101870', - 'ext': 'mp4', - 'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)', - 'creator': 'Dispatch', - 'view_count': int, - 'subtitles': 'mincount:6', - 'uploader_id': 'V__FRA08071', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20181130', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1543601327, - 'like_count': int, - 'channel': 'Dispatch', - 'channel_id': 'C796F3', - 'comment_count': int, - 'release_timestamp': 1543601040, - 'release_date': '20181130', - 'duration': 279, - }, - 'params': { - 'skip_download': True, - }, - }] - - # The wayback machine has special timestamp and "mode" values: - # timestamp: - # 1 = the first capture - # 2 = the last capture - # mode: - # id_ = Identity - perform no alterations of the original resource, return it as it was archived. 
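    # Combining the two, the wayback URL scheme used here is
    # 'https://web.archive.org/web/{timestamp}{mode}/{original_url}', so the
    # base below requests the last capture ('2') with no alteration ('id_').
    # An illustrative URL built from one of the test cases above:
    #
    #     https://web.archive.org/web/2id_/http://www.vlive.tv/video/1326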
- _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/' - - def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs): - for retry in self.RetryManager(): - try: - return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 404: - raise ExtractorError('Page was not archived', expected=True) - retry.error = e - continue - - def _download_archived_json(self, url, video_id, **kwargs): - page = self._download_archived_page(url, video_id, **kwargs) - if not page: - raise ExtractorError('Page was not archived', expected=True) - else: - return self._parse_json(page, video_id) - - def _extract_formats_from_m3u8(self, m3u8_url, params, video_id): - m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False) - if not m3u8_doc: - return - - # M3U8 document should be changed to archive domain - m3u8_doc = m3u8_doc.splitlines() - url_base = m3u8_url.rsplit('/', 1)[0] - first_segment = None - for i, line in enumerate(m3u8_doc): - if not line.startswith('#'): - m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}' - first_segment = first_segment or m3u8_doc[i] - - # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870 - urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False, - fatal=False, note='Check first segment availablity') - if urlh: - formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id) - if subtitles: - self._report_ignoring_subs('m3u8') - return formats - - # Closely follows the logic of the ArchiveTeam grab script - # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua - def _real_extract(self, url): - video_id, url_date = self._match_valid_url(url).group('id', 'date') - - webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date) - - player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id) - user_country = traverse_obj(player_info, ('common', 'userCountry')) - - main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url') - main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script') - app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id') - - inkey = self._download_archived_json( - f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={ - 'appId': app_id, - 'platformType': 'PC', - 'gcc': user_country, - 'locale': 'en_US', - }, fatal=False) - - vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId')) - - vod_data = self._download_archived_json( - f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={ - 'key': inkey.get('inkey'), - 'pid': 'rmcPlayer_16692457559726800', # partially unix time and partially random. 
Fixed value used by archiveteam project - 'sid': '2024', - 'ver': '2.0', - 'devt': 'html5_pc', - 'doct': 'json', - 'ptc': 'https', - 'sptc': 'https', - 'cpt': 'vtt', - 'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D', - 'pv': '4.26.9', - 'dr': '1920x1080', - 'cpl': 'en_US', - 'lc': 'en_US', - 'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D', - 'adu': '%2F', - 'videoId': vod_id, - 'cc': user_country, - }) - - formats = [] - - streams = traverse_obj(vod_data, ('streams', ...)) - if len(streams) > 1: - self.report_warning('Multiple streams found. Only the first stream will be downloaded.') - stream = streams[0] - - max_stream = max( - stream.get('videos') or [], - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_stream is not None: - params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'} - formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or [] - - # For parts of the project MP4 files were archived - max_video = max( - traverse_obj(vod_data, ('videos', 'list', ...)), - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_video is not None: - video_url = self._WAYBACK_BASE_URL + max_video.get('source') - urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False, - fatal=False, note='Check video availablity') - if urlh: - formats.append({'url': video_url}) - - return { - 'id': video_id, - 'formats': formats, - **traverse_obj(player_info, ('postDetail', 'post', { - 'title': ('officialVideo', 'title', {str}), - 'creator': ('author', 'nickname', {str}), - 'channel': ('channel', 'channelName', {str}), - 'channel_id': ('channel', 'channelCode', {str}), - 'duration': ('officialVideo', 'playTime', {int_or_none}), - 'view_count': ('officialVideo', 'playCount', {int_or_none}), - 'like_count': ('officialVideo', 'likeCount', {int_or_none}), - 'comment_count': ('officialVideo', 'commentCount', {int_or_none}), - 'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}), - 'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}), - })), - **traverse_obj(vod_data, ('meta', { - 'uploader_id': ('user', 'id', {str}), - 'uploader': ('user', 'name', {str}), - 'uploader_url': ('user', 'url', {url_or_none}), - 'thumbnail': ('cover', 'source', {url_or_none}), - }), expected_type=lambda x: x or None), - **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]), - } diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index d79caf5f3d..2d8459b02b 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE + @staticmethod # 
NB: Used in WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): From 94389b225d9bcf29aa7ba8afaf1bbd7c62204eae Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Sep 2023 21:42:42 -0500 Subject: [PATCH 386/501] [ie/RTVSLO] Fix format extraction (#8131) Closes #8020 Authored by: bashonly --- yt_dlp/extractor/rtvslo.py | 50 +++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py index 05942b6b44..39ace7cc6e 100644 --- a/yt_dlp/extractor/rtvslo.py +++ b/yt_dlp/extractor/rtvslo.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, parse_duration, traverse_obj, unified_timestamp, @@ -25,7 +26,7 @@ class RTVSLOIE(InfoExtractor): 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', 'info_dict': { 'id': '174842550', - 'ext': 'flv', + 'ext': 'mp4', 'release_timestamp': 1643140032, 'upload_date': '20220125', 'series': 'Dnevnik', @@ -69,7 +70,21 @@ class RTVSLOIE(InfoExtractor): 'tbr': 128000, 'release_date': '20220201', }, - + }, { + 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', + 'info_dict': { + 'id': '148350750', + 'ext': 'mp4', + 'title': 'Prvi šolski dan, mozaična oddaja za mlade', + 'series': 'Razred zase', + 'series_id': '148185730', + 'duration': 1481, + 'upload_date': '20121019', + 'timestamp': 1350672122, + 'release_date': '20121019', + 'release_timestamp': 1350672122, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', + }, }, { 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', 'only_matching': True @@ -98,13 +113,14 @@ def _real_extract(self, url): media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response'] formats = [] + skip_protocols = ['smil', 'f4m', 'dash'] adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none) if adaptive_url: - formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']) + formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols) adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none) if adaptive_url: - for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']): + for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols): formats.append({ **f, 'format_id': 'sign-' + f['format_id'], @@ -114,19 +130,19 @@ def _real_extract(self, url): else f.get('language')) }) - formats.extend( - { - 'url': f['streams'][strm], - 'ext': traverse_obj(f, 'mediaType', expected_type=str.lower), - 'width': f.get('width'), - 'height': f.get('height'), - 'tbr': f.get('bitrate'), - 'filesize': f.get('filesize'), - } - for strm in ('http', 'https') - for f in media.get('mediaFiles') or [] - if traverse_obj(f, ('streams', strm)) - ) + for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))): + formats.append(traverse_obj(mediafile, { + 'url': ('streams', 'https'), + 'ext': ('mediaType', {str.lower}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('filesize', {int_or_none}), + })) + + for mediafile in traverse_obj(media, ('mediaFiles', lambda 
_, v: url_or_none(v['streams']['hls_sec']))): + formats.extend(self._extract_wowza_formats( + mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols)) if any('intermission.mp4' in x['url'] for x in formats): self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) From 836e06d246512f286f30c1371b2c54b72c9ecd93 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 17 Sep 2023 12:56:50 +0200 Subject: [PATCH 387/501] [core] Fix support for upcoming Python 3.12 (#8130) This also adds the following test runners: - `3.12-dev` on `ubuntu-latest` - `3.12-dev` on `windows-latest` - `pypy-3.10` on `ubuntu-latest` Authored by: Grub4K --- .github/workflows/core.yml | 5 ++++- devscripts/update-version.py | 4 ++-- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/aws.py | 2 +- yt_dlp/extractor/goplay.py | 4 ++-- yt_dlp/extractor/motherless.py | 2 +- yt_dlp/extractor/panopto.py | 4 ++-- yt_dlp/networking/_urllib.py | 2 +- yt_dlp/networking/exceptions.py | 2 +- yt_dlp/utils/_utils.py | 12 ++++++++---- 10 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index dead444c0b..689408c500 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -13,13 +13,16 @@ jobs: matrix: os: [ubuntu-latest] # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8] + python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest python-version: '3.7' run-tests-ext: bat + - os: windows-latest + python-version: '3.12-dev' + run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 run-tests-ext: bat diff --git a/devscripts/update-version.py b/devscripts/update-version.py index c873d10a5d..0144bd284a 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -10,14 +10,14 @@ import argparse import contextlib import sys -from datetime import datetime +from datetime import datetime, timezone from devscripts.utils import read_version, run_process, write_file def get_new_version(version, revision): if not version: - version = datetime.utcnow().strftime('%Y.%m.%d') + version = datetime.now(timezone.utc).strftime('%Y.%m.%d') if revision: assert revision.isdigit(), 'Revision must be a number' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 666d89b461..1feed30524 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2591,7 +2591,7 @@ def _fill_common_fields(self, info_dict, final=True): # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, # see http://bugs.python.org/issue1646728) with contextlib.suppress(ValueError, OverflowError, OSError): - upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc) info_dict[date_key] = upload_date.strftime('%Y%m%d') live_keys = ('is_live', 'was_live') diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py index eb831a1530..c4741a6a11 100644 --- a/yt_dlp/extractor/aws.py +++ b/yt_dlp/extractor/aws.py @@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _aws_execute_api(self, aws_dict, video_id, query=None): query = query or {} - amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ') date = amz_date[:8] headers = { 'Accept': 'application/json', diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 960d7d7bc0..0a3c8340f1 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -383,9 +383,9 @@ def __get_current_timestamp(): months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - time_now = datetime.datetime.utcnow() + time_now = datetime.datetime.now(datetime.timezone.utc) format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) - time_string = datetime.datetime.utcnow().strftime(format_string) + time_string = time_now.strftime(format_string) return time_string def __str__(self): diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index 769b52ce6d..e359c44e93 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -151,7 +151,7 @@ def _real_extract(self, url): 'd': 'days', } kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} - upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') + upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 6e3c9f442d..5ab2b2bcec 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -1,7 +1,7 @@ import calendar import json import functools -from datetime import datetime +from datetime import datetime, timezone from random import random from .common import InfoExtractor @@ -243,7 +243,7 @@ def _mark_watched(self, base_url, video_id, delivery_info): invocation_id = delivery_info.get('InvocationId') stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) if invocation_id and stream_id and duration: - timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' + timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/' data = { 'streamRequests': [ { diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index b3e705b844..3c0647ecf9 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -429,7 +429,7 @@ def _send(self, request): except urllib.error.HTTPError as e: if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)): # Prevent file object from being closed 
when urllib.error.HTTPError is destroyed. - e._closer.file = None + e._closer.close_called = True raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e raise # unexpected except urllib.error.URLError as e: diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 10afc9ccbf..465b18ba94 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -115,7 +115,7 @@ def __init__(self, http_error: HTTPError): hdrs=http_error.response.headers, fp=http_error.response ) - self._closer.file = None # Disable auto close + self._closer.close_called = True # Disable auto close self._http_error = http_error HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 180bec245a..ef26de1160 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -669,6 +669,7 @@ def replace_insane(char): def sanitize_path(s, force=False): """Sanitizes and normalizes path on Windows""" + # XXX: this handles drive relative paths (c:sth) incorrectly if sys.platform == 'win32': force = False drive_or_unc, _ = os.path.splitdrive(s) @@ -687,7 +688,10 @@ def sanitize_path(s, force=False): sanitized_path.insert(0, drive_or_unc + os.path.sep) elif force and s and s[0] == os.path.sep: sanitized_path.insert(0, os.path.sep) - return os.path.join(*sanitized_path) + # TODO: Fix behavioral differences <3.12 + # The workaround using `normpath` only superficially passes tests + # Ref: https://github.com/python/cpython/pull/100351 + return os.path.normpath(os.path.join(*sanitized_path)) def sanitize_url(url, *, scheme='http'): @@ -1256,7 +1260,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.utcnow(), precision) + today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': @@ -1319,8 +1323,8 @@ def datetime_round(dt, precision='day'): 'second': 1, } roundto = lambda x, n: ((x + n / 2) // n) * n - timestamp = calendar.timegm(dt.timetuple()) - return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision])) + timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision]) + return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) def hyphenate_date(date_str): From 30ba233d4cee945756ed7344e7ddb3a90d2ae608 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 17 Sep 2023 13:22:04 +0200 Subject: [PATCH 388/501] [devscripts] `make_changelog`: Fix changelog grouping and add networking group (#8124) Authored by: Grub4K --- devscripts/changelog_override.json | 21 ++++++- devscripts/make_changelog.py | 96 ++++++++++++++++-------------- 2 files changed, 71 insertions(+), 46 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index d03db3f232..e7f453acf8 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -68,6 +68,25 @@ { "action": "change", "when": "b03fa7834579a01cc5fba48c0e73488a16683d48", - "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b" + "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b", + "authors": ["pukkandan"] + }, + { + "action": "change", + "when": "fcd6a76adc49d5cd8783985c7ce35384b72e545f", + "short": "[test] Add 
tests for socks proxies (#7908)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "4bf912282a34b58b6b35d8f7e6be535770c89c76", + "short": "[rh:urllib] Remove dot segments during URL normalization (#7662)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85", + "short": "[rh:urllib] Simplify gzip decoding (#7611)", + "authors": ["Grub4K"] } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 84f72d52f3..ac68dcd19a 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -31,35 +31,27 @@ class CommitGroup(enum.Enum): EXTRACTOR = 'Extractor' DOWNLOADER = 'Downloader' POSTPROCESSOR = 'Postprocessor' + NETWORKING = 'Networking' MISC = 'Misc.' - @classmethod - @property - def ignorable_prefixes(cls): - return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream') - @classmethod @lru_cache - def commit_lookup(cls): + def subgroup_lookup(cls): return { name: group for group, names in { - cls.PRIORITY: {'priority'}, cls.CORE: { 'aes', 'cache', 'compat_utils', 'compat', 'cookies', - 'core', 'dependencies', 'formats', 'jsinterp', - 'networking', 'outtmpl', 'plugins', 'update', - 'upstream', 'utils', }, cls.MISC: { @@ -67,23 +59,40 @@ def commit_lookup(cls): 'cleanup', 'devscripts', 'docs', - 'misc', 'test', }, - cls.EXTRACTOR: {'extractor', 'ie'}, - cls.DOWNLOADER: {'downloader', 'fd'}, - cls.POSTPROCESSOR: {'postprocessor', 'pp'}, + cls.NETWORKING: { + 'rh', + }, }.items() for name in names } @classmethod - def get(cls, value): - result = cls.commit_lookup().get(value) - if result: - logger.debug(f'Mapped {value!r} => {result.name}') + @lru_cache + def group_lookup(cls): + result = { + 'fd': cls.DOWNLOADER, + 'ie': cls.EXTRACTOR, + 'pp': cls.POSTPROCESSOR, + 'upstream': cls.CORE, + } + result.update({item.name.lower(): item for item in iter(cls)}) return result + @classmethod + def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: + group, _, subgroup = (group.strip().lower() for group in value.partition('/')) + + result = cls.group_lookup().get(group) + if not result: + if subgroup: + return None, value + subgroup = group + result = cls.subgroup_lookup().get(subgroup) + + return result, subgroup or None + @dataclass class Commit: @@ -198,19 +207,23 @@ def _prepare_cleanup_misc_items(self, items): for commit_infos in cleanup_misc_items.values(): sorted_items.append(CommitInfo( 'cleanup', ('Miscellaneous',), ', '.join( - self._format_message_link(None, info.commit.hash).strip() + self._format_message_link(None, info.commit.hash) for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), [], Commit(None, '', commit_infos[0].commit.authors), [])) return sorted_items - def format_single_change(self, info): - message = self._format_message_link(info.message, info.commit.hash) + def format_single_change(self, info: CommitInfo): + message, sep, rest = info.message.partition('\n') + if '[' not in message: + # If the message doesn't already contain markdown links, try to add a link to the commit + message = self._format_message_link(message, info.commit.hash) + if info.issues: - message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1) + message = f'{message} ({self._format_issues(info.issues)})' if info.commit.authors: - message = message.replace('\n', f' by {self._format_authors(info.commit.authors)}\n', 1) + message = f'{message} by {self._format_authors(info.commit.authors)}' if info.fixes: fix_message = ', 
'.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes) @@ -219,16 +232,14 @@ def format_single_change(self, info): if authors != info.commit.authors: fix_message = f'{fix_message} by {self._format_authors(authors)}' - message = message.replace('\n', f' (With fixes in {fix_message})\n', 1) + message = f'{message} (With fixes in {fix_message})' - return message[:-1] + return message if not sep else f'{message}{sep}{rest}' def _format_message_link(self, message, hash): assert message or hash, 'Improperly defined commit message or override' message = message if message else hash[:HASH_LENGTH] - if not hash: - return f'{message}\n' - return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1) + return f'[{message}]({self.repo_url}/commit/{hash})' if hash else message def _format_issues(self, issues): return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues) @@ -318,7 +329,7 @@ def _get_commits_and_fixes(self, default_author): for commitish, revert_commit in reverts.items(): reverted = commits.pop(commitish, None) if reverted: - logger.debug(f'{commit} fully reverted {reverted}') + logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -337,7 +348,7 @@ def apply_overrides(self, overrides): for override in overrides: when = override.get('when') if when and when not in self and when != self._start: - logger.debug(f'Ignored {when!r}, not in commits {self._start!r}') + logger.debug(f'Ignored {when!r} override') continue override_hash = override.get('hash') or when @@ -365,7 +376,7 @@ def groups(self): for commit in self: upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short) if upstream_re: - commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}' + commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}' match = self.MESSAGE_RE.fullmatch(commit.short) if not match: @@ -410,25 +421,20 @@ def details_from_prefix(prefix): if not prefix: return CommitGroup.CORE, None, () - prefix, _, details = prefix.partition('/') - prefix = prefix.strip() - details = details.strip() + prefix, *sub_details = prefix.split(':') - group = CommitGroup.get(prefix.lower()) - if group is CommitGroup.PRIORITY: - prefix, _, details = details.partition('/') + group, details = CommitGroup.get(prefix) + if group is CommitGroup.PRIORITY and details: + details = details.partition('/')[2].strip() - if not details and prefix and prefix not in CommitGroup.ignorable_prefixes: - logger.debug(f'Replaced details with {prefix!r}') - details = prefix or None + if details and '/' in details: + logger.error(f'Prefix is overnested, using first part: {prefix}') + details = details.partition('/')[0].strip() if details == 'common': details = None - - if details: - details, *sub_details = details.split(':') - else: - sub_details = [] + elif group is CommitGroup.NETWORKING and details == 'rh': + details = 'Request Handler' return group, details, sub_details From 58493923e9b6f774947a2131e5258e9f3cf816be Mon Sep 17 00:00:00 2001 From: soundchaser128 <69268557+soundchaser128@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:09:42 +0200 Subject: [PATCH 389/501] [ie/rule34video] Extract tags (#7117) Authored by: soundchaser128 --- yt_dlp/extractor/rule34video.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 9d15f4d214..f3250b557a 100644 --- a/yt_dlp/extractor/rule34video.py +++ 
b/yt_dlp/extractor/rule34video.py @@ -1,6 +1,6 @@ import re -from ..utils import parse_duration +from ..utils import parse_duration, unescapeHTML from .common import InfoExtractor @@ -16,7 +16,8 @@ class Rule34VideoIE(InfoExtractor): 'title': 'Shot It-(mmd hmv)', 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg', 'duration': 347.0, - 'age_limit': 18 + 'age_limit': 18, + 'tags': 'count:14' } }, { @@ -28,7 +29,8 @@ class Rule34VideoIE(InfoExtractor): 'title': 'Lara in Trouble Ep. 7 [WildeerStudio]', 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg', 'duration': 938.0, - 'age_limit': 18 + 'age_limit': 18, + 'tags': 'count:50' } }, ] @@ -57,5 +59,7 @@ def _real_extract(self, url): 'title': title, 'thumbnail': thumbnail, 'duration': parse_duration(duration), - 'age_limit': 18 + 'age_limit': 18, + 'tags': list(map(unescapeHTML, re.findall( + r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))), } From efa2339502a37cf13ae7f143bd8b2c28f452d1cd Mon Sep 17 00:00:00 2001 From: Simon <simon30002021@icloud.com> Date: Sun, 17 Sep 2023 17:11:22 +0200 Subject: [PATCH 390/501] [ie/lecturio] Improve `_VALID_URL` (#7649) Authored by: simon300000 --- yt_dlp/extractor/lecturio.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py index bb059d3a29..795012541c 100644 --- a/yt_dlp/extractor/lecturio.py +++ b/yt_dlp/extractor/lecturio.py @@ -57,8 +57,8 @@ class LecturioIE(LecturioBaseIE): _VALID_URL = r'''(?x) https:// (?: - app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| - (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag + app\.lecturio\.com/([^/?#]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| + (?:www\.)?lecturio\.de/(?:[^/?#]+/)+(?P<nt_de>[^/?#&]+)\.vortrag ) ''' _TESTS = [{ @@ -73,6 +73,9 @@ class LecturioIE(LecturioBaseIE): }, { 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', 'only_matching': True, + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-at-1-staatsexamen/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, }, { 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', 'only_matching': True, From 63e0c5748c0eb461a2ccca4181616eb930b4b750 Mon Sep 17 00:00:00 2001 From: aky-01 <65510015+aky-01@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:16:11 +0200 Subject: [PATCH 391/501] [ie/IndavideoEmbed] Fix extraction (#8129) Closes #7190 Authored by: aky-01 --- yt_dlp/extractor/indavideo.py | 73 +++++++++++++++++------------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py index 4fa97d8bba..564bf8a024 100644 --- a/yt_dlp/extractor/indavideo.py +++ b/yt_dlp/extractor/indavideo.py @@ -1,9 +1,9 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + time_seconds, update_url_query, ) @@ -11,15 +11,14 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - 
# http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] + # https://indavideo.hu/video/Vicces_cica_1 + # https://index.indavideo.hu/video/Hod_Nemetorszagban + # https://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # https://film.indavideo.hu/video/f_farkaslesen + # https://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ - 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', + 'url': 'https://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', @@ -36,21 +35,33 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'], }, }, { - 'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', - 'only_matching': True, - }, { - 'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1', + 'url': 'https://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://indavideo.hu/video/Vicces_cica_1', + 'info_dict': { + 'id': '1335611', + 'ext': 'mp4', + 'title': 'Vicces cica', + 'description': 'Játszik a tablettel. :D', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Jet_Pack', + 'uploader_id': '491217', + 'timestamp': 1390821212, + 'upload_date': '20140127', + 'duration': 7, + 'age_limit': 0, + 'tags': ['cica', 'Jet_Pack'], + }, + }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, - video_id)['data'] - - title = video['title'] + f'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/{video_id}/', + video_id, query={'_': time_seconds()})['data'] video_urls = [] @@ -60,33 +71,21 @@ def _real_extract(self, url): elif isinstance(video_files, dict): video_urls.extend(video_files.values()) - video_file = video.get('video_file') - if video: - video_urls.append(video_file) video_urls = list(set(video_urls)) - video_prefix = video_urls[0].rsplit('/', 1)[0] - - for flv_file in video.get('flv_files', []): - flv_url = '%s/%s' % (video_prefix, flv_file) - if flv_url not in video_urls: - video_urls.append(flv_url) - - filesh = video.get('filesh') + filesh = video.get('filesh') or {} formats = [] for video_url in video_urls: height = int_or_none(self._search_regex( r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if filesh: - if not height: - continue - token = filesh.get(compat_str(height)) - if token is None: - continue - video_url = update_url_query(video_url, {'token': token}) + if not height and len(filesh) == 1: + height = int_or_none(list(filesh.keys())[0]) + token = filesh.get(str(height)) + if token is None: + continue formats.append({ - 'url': video_url, + 'url': update_url_query(video_url, {'token': token}), 'height': height, }) @@ -103,7 +102,7 @@ def _real_extract(self, url): return { 'id': video.get('id') or video_id, - 'title': title, + 'title': video.get('title'), 'description': video.get('description'), 'thumbnails': thumbnails, 'uploader': video.get('user_name'), From 81f46ac573dc443ad48560f308582a26784d3015 
Mon Sep 17 00:00:00 2001 From: Sebastian Koch <sebastian@0py.de> Date: Sun, 17 Sep 2023 22:54:00 +0200 Subject: [PATCH 392/501] [ie/massengeschmack.tv] Fix title extraction (#7813) Authored by: sb0stn --- yt_dlp/extractor/massengeschmacktv.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/massengeschmacktv.py b/yt_dlp/extractor/massengeschmacktv.py index 7dacb43e02..1490e9b21d 100644 --- a/yt_dlp/extractor/massengeschmacktv.py +++ b/yt_dlp/extractor/massengeschmacktv.py @@ -17,11 +17,12 @@ class MassengeschmackTVIE(InfoExtractor): _TEST = { 'url': 'https://massengeschmack.tv/play/fktv202', - 'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3', + 'md5': '9996f314994a49fefe5f39aa1b07ae21', 'info_dict': { 'id': 'fktv202', 'ext': 'mp4', - 'title': 'Fernsehkritik-TV - Folge 202', + 'title': 'Fernsehkritik-TV #202', + 'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg' }, } @@ -29,9 +30,6 @@ def _real_extract(self, url): episode = self._match_id(url) webpage = self._download_webpage(url, episode) - title = clean_html(self._html_search_regex( - '<h3>([^<]+)</h3>', webpage, 'title')) - thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) formats = [] @@ -67,7 +65,8 @@ def _real_extract(self, url): return { 'id': episode, - 'title': title, + 'title': clean_html(self._html_search_regex( + r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)), 'formats': formats, - 'thumbnail': thumbnail, + 'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False), } From 20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Mon, 18 Sep 2023 07:33:26 +0000 Subject: [PATCH 393/501] [networking] Fix various socks proxy bugs (#8065) - Fixed support for IPv6 socks proxies - Fixed support for IPv6 over socks5 - Fixed --source-address not being obeyed for socks4 and socks5 - Fixed socks4a when the destination address is an IPv4 address Closes https://github.com/yt-dlp/yt-dlp/issues/7959 Fixes https://github.com/ytdl-org/youtube-dl/issues/15368 Authored by: coletdjnz Co-authored-by: Simon Sawicki <accounts@grub4k.xyz> Co-authored-by: bashonly <bashonly@bashonly.com> --- test/test_socks.py | 38 +++++--------------- yt_dlp/networking/_helper.py | 57 ++++++++++++++++++++++++++++++ yt_dlp/networking/_urllib.py | 68 +++++++++++++----------------------- yt_dlp/socks.py | 31 +++++++++------- 4 files changed, 110 insertions(+), 84 deletions(-) diff --git a/test/test_socks.py b/test/test_socks.py index 95ffce275b..211ee814d1 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -281,17 +281,13 @@ def test_socks4_auth(self, handler, ctx): rh, proxies={'all': f'socks4://user:@{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='socks4a implementation currently broken when destination is not a domain name')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks4a_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='127.0.0.1') assert response['version'] == 4 - assert 
response['ipv4_address'] == '127.0.0.1' - assert response['domain_address'] is None + assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1') @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks4a_domain_target(self, handler, ctx): @@ -302,10 +298,7 @@ def test_socks4a_domain_target(self, handler, ctx): assert response['ipv4_address'] is None assert response['domain_address'] == 'localhost' - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='source_address is not yet supported for socks4 proxies')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -327,10 +320,7 @@ def test_socks4_errors(self, handler, ctx, reply_code): with pytest.raises(ProxyError): ctx.socks_info_request(rh) - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 socks4 proxies are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv6_socks4_proxy(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks4://{server_address}'}) as rh: @@ -342,7 +332,7 @@ def test_ipv6_socks4_proxy(self, handler, ctx): @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_timeout(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: - with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh: + with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh: with pytest.raises(TransportError): ctx.socks_info_request(rh) @@ -383,7 +373,7 @@ def test_socks5_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='localhost') - assert response['ipv4_address'] == '127.0.0.1' + assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1') assert response['version'] == 5 @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) @@ -404,22 +394,15 @@ def test_socks5h_ip_target(self, handler, ctx): assert response['domain_address'] is None assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 destination addresses are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks5_ipv6_destination(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='[::1]') assert response['ipv6_address'] == '::1' - assert response['port'] == 80 assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 socks5 proxies are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv6_socks5_proxy(self, handler, ctx): with 
ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -430,10 +413,7 @@ def test_ipv6_socks5_proxy(self, handler, ctx): # XXX: is there any feasible way of testing IPv6 source addresses? # Same would go for non-proxy source_address test... - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='source_address is not yet supported for socks5 proxies')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index a43c57bb4b..4c9dbf25dc 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -2,6 +2,7 @@ import contextlib import functools +import socket import ssl import sys import typing @@ -206,3 +207,59 @@ def wrapper(self, *args, **kwargs): e.handler = self raise return wrapper + + +def _socket_connect(ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = ip_addr + sock = socket.socket(af, socktype, proto) + try: + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(sa) + return sock + except socket.error: + sock.close() + raise + + +def create_connection( + address, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None, + *, + _create_socket_func=_socket_connect +): + # Work around socket.create_connection() which tries all addresses from getaddrinfo() including IPv6. + # This filters the addresses based on the given source_address. + # Based on: https://github.com/python/cpython/blob/main/Lib/socket.py#L810 + host, port = address + ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + if not ip_addrs: + raise socket.error('getaddrinfo returns an empty list') + if source_address is not None: + af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in ip_addrs if addr[0] == af] + if not ip_addrs: + raise OSError( + f'No remote IPv{4 if af == socket.AF_INET else 6} addresses available for connect. ' + f'Can\'t use "{source_address[0]}" as source address') + + err = None + for ip_addr in ip_addrs: + try: + sock = _create_socket_func(ip_addr, timeout, source_address) + # Explicitly break __traceback__ reference cycle + # https://bugs.python.org/issue36820 + err = None + return sock + except socket.error as e: + err = e + + try: + raise err + finally: + # Explicitly break __traceback__ reference cycle + # https://bugs.python.org/issue36820 + err = None diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 3c0647ecf9..c327f7744e 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -23,6 +23,7 @@ from ._helper import ( InstanceStoreMixin, add_accept_encoding_header, + create_connection, get_redirect_method, make_socks_proxy_opts, select_proxy, @@ -54,44 +55,10 @@ def _create_http_connection(http_class, source_address, *args, **kwargs): hc = http_class(*args, **kwargs) + if hasattr(hc, '_create_connection'): + hc._create_connection = create_connection + if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. 
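The heart of the source-address fix is the family filtering inside the new `create_connection()` helper above: `getaddrinfo()` can return both IPv4 and IPv6 candidates, and binding, say, an IPv4 source address while connecting to an IPv6 candidate fails. A condensed sketch of just that filtering step (the host and port values are illustrative):

    import socket

    # Condensed from create_connection() above: keep only the getaddrinfo()
    # candidates whose address family matches the requested source address.
    def filter_candidates(host, port, source_address):
        ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6
        return [addr for addr in ip_addrs if addr[0] == af]

    # e.g. filter_candidates('localhost', 80, ('127.0.0.1', 0)) drops any IPv6
    # entries that getaddrinfo() returned for localhost.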
This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise OSError( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except OSError as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise OSError('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection hc.source_address = (source_address, 0) return hc @@ -220,13 +187,28 @@ def make_socks_conn_class(base_class, socks_proxy): proxy_args = make_socks_proxy_opts(socks_proxy) class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(**proxy_args) - if type(self.timeout) in (int, float): # noqa: E721 - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) + _create_connection = create_connection + def connect(self): + def sock_socket_connect(ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = ip_addr + sock = sockssocket(af, socktype, proto) + try: + connect_proxy_args = proxy_args.copy() + connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) + sock.setproxy(**connect_proxy_args) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect((self.host, self.port)) + return sock + except socket.error: + sock.close() + raise + self.sock = create_connection( + (proxy_args['addr'], proxy_args['port']), timeout=self.timeout, + source_address=self.source_address, _create_socket_func=sock_socket_connect) if isinstance(self, http.client.HTTPSConnection): self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host) diff --git a/yt_dlp/socks.py b/yt_dlp/socks.py index f93328f63a..e7f41d7e2a 100644 --- a/yt_dlp/socks.py +++ b/yt_dlp/socks.py @@ -134,26 +134,31 @@ def _check_response_version(self, expected_version, got_version): self.close() raise InvalidVersionError(expected_version, got_version) - def _resolve_address(self, destaddr, default, use_remote_dns): - try: - return socket.inet_aton(destaddr) - except OSError: - if use_remote_dns and self._proxy.remote_dns: - return default - else: - return socket.inet_aton(socket.gethostbyname(destaddr)) + def _resolve_address(self, destaddr, default, use_remote_dns, family=None): + for f in (family,) if family else (socket.AF_INET, socket.AF_INET6): + try: + return f, socket.inet_pton(f, destaddr) + except OSError: + continue + + if use_remote_dns and self._proxy.remote_dns: + return 0, default + else: + res = socket.getaddrinfo(destaddr, None, 
family=family or 0) + f, _, _, _, ipaddr = res[0] + return f, socket.inet_pton(f, ipaddr[0]) def _setup_socks4(self, address, is_4a=False): destaddr, port = address - ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) + _, ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a, family=socket.AF_INET) packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr username = (self._proxy.username or '').encode() packet += username + b'\x00' - if is_4a and self._proxy.remote_dns: + if is_4a and self._proxy.remote_dns and ipaddr == SOCKS4_DEFAULT_DSTIP: packet += destaddr.encode() + b'\x00' self.sendall(packet) @@ -210,7 +215,7 @@ def _socks5_auth(self): def _setup_socks5(self, address): destaddr, port = address - ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) + family, ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) self._socks5_auth() @@ -220,8 +225,10 @@ def _setup_socks5(self, address): destaddr = destaddr.encode() packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME) packet += self._len_and_data(destaddr) - else: + elif family == socket.AF_INET: packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr + elif family == socket.AF_INET6: + packet += struct.pack('!B', Socks5AddressType.ATYP_IPV6) + ipaddr packet += struct.pack('!H', port) self.sendall(packet) From ba8e9eb2c8bbb699f314169fab8e544437ad731e Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:08:40 -0600 Subject: [PATCH 394/501] [ie/radiofrance] Add support for livestreams, podcasts, playlists (#7006) Closes #4282 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 9 +- yt_dlp/extractor/radiofrance.py | 379 +++++++++++++++++++++++++++++++- 2 files changed, 382 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bf0c67542e..ec3ae0e668 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1555,7 +1555,14 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE -from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiofrance import ( + FranceCultureIE, + RadioFranceIE, + RadioFranceLiveIE, + RadioFrancePodcastIE, + RadioFranceProfileIE, + RadioFranceProgramScheduleIE, +) from .radiozet import RadioZetPodcastIE from .radiokapital import ( RadioKapitalIE, diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 92e51b7f45..35f4b91dd2 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -1,7 +1,18 @@ +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..utils import parse_duration, unified_strdate +from ..utils import ( + int_or_none, + join_nonempty, + js_to_json, + parse_duration, + strftime_or_none, + traverse_obj, + unified_strdate, + urljoin, +) class RadioFranceIE(InfoExtractor): @@ -56,8 +67,32 @@ def _real_extract(self, url): } -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' +class RadioFranceBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr' + + _STATIONS_RE = '|'.join(map(re.escape, ( + 'franceculture', + 'franceinfo', + 'franceinter', + 'francemusique', + 'fip', + 'mouv', + ))) + + 
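`_STATIONS_RE` gives every RadioFrance extractor in this patch one shared station alternation to splice into its `_VALID_URL`. A small sketch of what it evaluates to and how it composes (the assembled podcast prefix mirrors `FranceCultureIE` below, trimmed for illustration):

    import re

    # What the _STATIONS_RE attribute above evaluates to: an escaped alternation
    # of the six station slugs.
    stations_re = '|'.join(map(re.escape, (
        'franceculture', 'franceinfo', 'franceinter', 'francemusique', 'fip', 'mouv')))
    assert stations_re == 'franceculture|franceinfo|franceinter|francemusique|fip|mouv'

    # Spliced into a URL prefix the way the extractors below do:
    pattern = rf'https?://(?:www\.)?radiofrance\.fr/(?:{stations_re})/podcasts/'
    assert re.match(pattern, 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip')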
def _extract_data_from_webpage(self, webpage, display_id, key): + return traverse_obj(self._search_json( + r'\bconst\s+data\s*=', webpage, key, display_id, + contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json), + (..., 'data', key, {dict}), get_all=False) or {} + + +class FranceCultureIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#]) + ''' + _TESTS = [ { 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', @@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor): 'ext': 'mp3', 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?', 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', - 'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'upload_date': '20220514', 'duration': 2750, }, }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675', + 'info_dict': { + 'id': '2107675', + 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023', + 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot', + 'description': 'md5:36ee74351ede77a314fdebb94026b916', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20230310', + 'duration': 8977, + 'ext': 'mp3', + }, + }, { 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200', + 'only_matching': True, } ] @@ -89,7 +140,6 @@ def _real_extract(self, url): 'id': video_id, 'display_id': display_id, 'url': video_data['contentUrl'], - 'ext': video_data.get('encodingFormat'), 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, 'duration': parse_duration(video_data.get('duration')), 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', @@ -102,3 +152,322 @@ def _real_extract(self, url): 'upload_date': unified_strdate(self._search_regex( r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) } + + +class RadioFranceLiveIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + https?://(?:www\.)?radiofrance\.fr + /(?P<id>{RadioFranceBaseIE._STATIONS_RE}) + /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/', + 'info_dict': { + 'id': 'franceinter', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/franceculture', + 'info_dict': { + 'id': 'franceculture', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family', + 'info_dict': { + 'id': 'mouv-radio-musique-kids-family', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul', + 
'info_dict': { + 'id': 'mouv-radio-rnb-soul', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix', + 'info_dict': { + 'id': 'mouv-radio-musique-mix', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/fip/radio-rock', + 'info_dict': { + 'id': 'fip-radio-rock', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id') + + if substation_id: + webpage = self._download_webpage(url, station_id) + api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData') + else: + api_response = self._download_json( + f'https://www.radiofrance.fr/{station_id}/api/live', station_id) + + formats, subtitles = [], {} + for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])): + if media_source.get('format') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_source['url'], + 'abr': media_source.get('bitrate'), + }) + + return { + 'id': join_nonempty(station_id, substation_id), + 'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty( + ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class RadioFrancePlaylistBase(RadioFranceBaseIE): + """Subclasses must set _METADATA_KEY""" + + def _call_api(self, content_id, cursor, page_num): + raise NotImplementedError('This method must be implemented by subclasses') + + def _generate_playlist_entries(self, content_id, content_response): + for page_num in itertools.count(2): + for entry in content_response['items']: + yield self.url_result( + f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, { + 'title': 'title', + 'description': 'standFirst', + 'timestamp': ('publishedDate', {int_or_none}), + 'thumbnail': ('visual', 'src'), + })) + + next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False) + if not next_cursor: + break + + content_response = self._call_api(content_id, next_cursor, page_num) + + def _real_extract(self, url): + display_id = self._match_id(url) + + metadata = self._download_json( + 'https://www.radiofrance.fr/api/v2.1/path', display_id, + query={'value': urllib.parse.urlparse(url).path})['content'] + + content_id = metadata['id'] + + return self.playlist_result( + self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id, + display_id=display_id, **{**traverse_obj(metadata, { + 'title': 'title', + 'description': 'standFirst', + 'thumbnail': ('visual', 'src'), + }), **traverse_obj(metadata, { + 'title': 'name', + 'description': 'role', + })}) + + +class RadioFrancePodcastIE(RadioFrancePlaylistBase): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$) + ''' + + _TESTS = [{ + 'url': 
'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert', + 'info_dict': { + 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17', + 'display_id': 'le-billet-vert', + 'title': 'Le billet sciences', + 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale', + 'info_dict': { + 'id': '566fd524-3074-4fbc-ac69-8696f2152a54', + 'display_id': 'jean-marie-le-pen-l-obsession-nationale', + 'title': 'Jean-Marie Le Pen, l\'obsession nationale', + 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine', + 'info_dict': { + 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d', + 'display_id': 'serie-thomas-grjebine', + 'title': 'Thomas Grjebine', + }, + 'playlist_count': 1, + }, { + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip', + 'info_dict': { + 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e', + 'display_id': 'certains-l-aiment-fip', + 'title': 'Certains l’aiment Fip', + 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 321, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix', + 'only_matching': True, + }] + + _METADATA_KEY = 'expressions' + + def _call_api(self, podcast_id, cursor, page_num): + return self._download_json( + f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id, + note=f'Downloading page {page_num}', query={'pageCursor': cursor}) + + +class RadioFranceProfileIE(RadioFrancePlaylistBase): + _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3', + 'info_dict': { + 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb', + 'display_id': 'thomas-pesquet', + 'title': 'Thomas Pesquet', + 'description': 'Astronaute à l\'agence spatiale européenne', + }, + 'playlist_mincount': 212, + }, { + 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie', + 'info_dict': { + 'id': '9593050b-0183-4972-a0b5-d8f699079e02', + 'display_id': 'eugenie-bastie', + 'title': 'Eugénie Bastié', + 'description': 'Journaliste et essayiste', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 39, + }, { + 'url': 'https://www.radiofrance.fr/personnes/lea-salame', + 'only_matching': True, + }] + + _METADATA_KEY = 'documents' + + def _call_api(self, profile_id, cursor, page_num): + resp = self._download_json( + f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id, + note=f'Downloading page {page_num}', query={ + 'relation': 'personality', + 'cursor': cursor, + }) + + resp['next'] = traverse_obj(resp, ('pagination', 'next')) + return resp + + +class RadioFranceProgramScheduleIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?P<station>{RadioFranceBaseIE._STATIONS_RE}) + /grille-programmes(?:\?date=(?P<date>[\d-]+))? 
+ ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023', + 'info_dict': { + 'id': 'franceinter-program-20230217', + 'upload_date': '20230217', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023', + 'info_dict': { + 'id': 'franceculture-program-20230201', + 'upload_date': '20230201', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023', + 'info_dict': { + 'id': 'mouv-program-20230319', + 'upload_date': '20230319', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023', + 'info_dict': { + 'id': 'francemusique-program-20230318', + 'upload_date': '20230318', + }, + 'playlist_count': 15, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes', + 'only_matching': True, + }] + + def _generate_playlist_entries(self, webpage_url, api_response): + for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])): + yield self.url_result( + urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE, + url_transparent=True, **traverse_obj(entry, { + 'title': ('expression', 'title'), + 'thumbnail': ('expression', 'visual', 'src'), + 'timestamp': ('startTime', {int_or_none}), + 'series_id': ('concept', 'id'), + 'series': ('concept', 'title'), + })) + + def _real_extract(self, url): + station, date = self._match_valid_url(url).group('station', 'date') + webpage = self._download_webpage(url, station) + grid_data = self._extract_data_from_webpage(webpage, station, 'grid') + upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d') + + return self.playlist_result( + self._generate_playlist_entries(url, grid_data), + join_nonempty(station, 'program', upload_date), upload_date=upload_date) From 9e68747f9607f05e92bb7d9b6e79d678b50070e1 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:02:00 -0400 Subject: [PATCH 395/501] [ie/bilibili] Add support for series, favorites and watch later (#7518) Closes #6719 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/bilibili.py | 281 ++++++++++++++++++++++++++++++-- 2 files changed, 272 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ec3ae0e668..a6a286766f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -223,7 +223,11 @@ BiliBiliPlayerIE, BilibiliSpaceVideoIE, BilibiliSpaceAudioIE, - BilibiliSpacePlaylistIE, + BilibiliCollectionListIE, + BilibiliSeriesListIE, + BilibiliFavoritesListIE, + BilibiliWatchlaterIE, + BilibiliPlaylistIE, BiliIntlIE, BiliIntlSeriesIE, BiliLiveIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 290340078c..5e7042dbbd 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -15,6 +15,7 @@ GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, + bool_or_none, filter_dict, float_or_none, format_field, @@ -35,6 +36,7 @@ unsmuggle_url, url_or_none, urlencode_postdata, + variadic, ) @@ -156,7 +158,7 @@ def _get_episodes_from_season(self, ss_id, url): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' + _VALID_URL = 
r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -252,7 +254,7 @@ class BiliBiliIE(BilibiliBaseIE): 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'duration': 313.557, 'upload_date': '20220709', - 'uploader': '小夫Tech', + 'uploader': '小夫太渴', 'timestamp': 1657347907, 'uploader_id': '1326814124', 'comment_count': int, @@ -509,7 +511,7 @@ def _real_extract(self, url): class BiliBiliBangumiMediaIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { @@ -528,7 +530,7 @@ def _real_extract(self, url): class BiliBiliBangumiSeasonIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { @@ -679,13 +681,35 @@ def get_entries(page_data): return self.playlist_result(paged_list, playlist_id) -class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): - _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' +class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE): + def _get_entries(self, page_data, bvid_keys, ending_key='bvid'): + for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})): + yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid) + + def _get_uploader(self, uid, playlist_id): + webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False) + return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False) + + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries) + metadata.pop('page_count', None) + metadata.pop('page_size', None) + return metadata, page_list + + +class BilibiliCollectionListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)' _TESTS = [{ 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', 'info_dict': { 'id': '2142762_57445', - 'title': '《底特律 变人》' + 'title': '【完结】《底特律 变人》全结局流程解说', + 'description': '', + 'uploader': '老戴在此', + 'uploader_id': '2142762', + 'timestamp': int, + 'upload_date': str, + 'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg', }, 'playlist_mincount': 31, }] @@ -706,22 +730,251 @@ def get_metadata(page_data): return { 'page_count': math.ceil(entry_count / page_size), 'page_size': page_size, - 'title': traverse_obj(page_data, ('meta', 'name')) + 'uploader': self._get_uploader(mid, playlist_id), + **traverse_obj(page_data, { + 'title': ('meta', 'name', {str}), + 'description': ('meta', 'description', {str}), + 'uploader_id': ('meta', 'mid', {str_or_none}), + 'timestamp': ('meta', 'ptime', {int_or_none}), + 'thumbnail': ('meta', 'cover', {url_or_none}), + }) } def get_entries(page_data): - for entry in page_data.get('archives', []): - yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', - BiliBiliIE, entry['bvid']) + return self._get_entries(page_data, 'archives') metadata, 
paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) - return self.playlist_result(paged_list, playlist_id, metadata['title']) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliSeriesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0', + 'info_dict': { + 'id': '1958703906_547718', + 'title': '直播回放', + 'description': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + 'modified_timestamp': int, + 'modified_date': str, + }, + 'playlist_mincount': 513, + }] + + def _real_extract(self, url): + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + playlist_meta = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False + ), { + 'title': ('data', 'meta', 'name', {str}), + 'description': ('data', 'meta', 'description', {str}), + 'uploader_id': ('data', 'meta', 'mid', {str_or_none}), + 'timestamp': ('data', 'meta', 'ctime', {int_or_none}), + 'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}), + }) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/series/archives', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'uploader': self._get_uploader(mid, playlist_id), + **playlist_meta + } + + def get_entries(page_data): + return self._get_entries(page_data, 'archives') + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create', + 'info_dict': { + 'id': '1103407912', + 'title': '【V2】(旧)', + 'description': '', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'modified_timestamp': int, + 'modified_date': str, + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + 'view_count': int, + 'like_count': int, + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/detail/ml1103407912', + 'only_matching': True, + }] + + def _real_extract(self, url): + fid = self._match_id(url) + + list_info = self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20', + fid, note='Downloading favlist metadata') + if list_info['code'] == -403: + self.raise_login_required(msg='This is a private favorites list. 
You need to log in as its owner') + + entries = self._get_entries(self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}', + fid, note='Download favlist entries'), 'data') + + return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', { + 'title': ('title', {str}), + 'description': ('intro', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'modified_timestamp': ('mtime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'view_count': ('cnt_info', 'play', {int_or_none}), + 'like_count': ('cnt_info', 'thumb_up', {int_or_none}), + }))) + + +class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/watchlater/#/list', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _real_extract(self, url): + list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater') + watchlater_info = self._download_json( + 'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id) + if watchlater_info['code'] == -101: + self.raise_login_required(msg='You need to login to access your watchlater list') + entries = self._get_entries(watchlater_info, ('data', 'list')) + return self.playlist_result(entries, id=list_id, title='稍后再看') + + +class BilibiliPlaylistIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/list/1958703906?sid=547718', + 'info_dict': { + 'id': '5_547718', + 'title': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + }, + 'playlist_mincount': 513, + }, { + 'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1', + 'info_dict': { + 'id': '5_547718', + }, + 'playlist_mincount': 513, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + 'title': '【V2】(旧)', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/play/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + }, + 'playlist_mincount': 22, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/medialist/play/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _extract_medialist(self, query, list_id): + for page_num in itertools.count(1): + page_data = self._download_json( + 'https://api.bilibili.com/x/v2/medialist/resource/list', + list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}' + )['data'] + yield from self._get_entries(page_data, 'media_list', ending_key='bv_id') + query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id')) + if not page_data.get('has_more', False): + break + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) 
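The `_extract_medialist()` helper above pages with a cursor rather than page numbers: each API response reports `has_more`, and the id of the last returned entry becomes the `oid` for the next request. A self-contained sketch of that loop, with a stub dict standing in for the real `/x/v2/medialist/resource/list` responses:

    import itertools

    # Cursor pagination as in _extract_medialist() above; fetch_page is a stub
    # for the real API call and the page payloads are made up.
    def paginate(fetch_page):
        oid = None
        for page_num in itertools.count(1):
            page_data = fetch_page(oid, page_num)
            yield from page_data['media_list']
            oid = page_data['media_list'][-1]['id']
            if not page_data.get('has_more', False):
                break

    pages = {
        None: {'media_list': [{'id': 1}, {'id': 2}], 'has_more': True},
        2: {'media_list': [{'id': 3}], 'has_more': False},
    }
    assert list(paginate(lambda oid, _: pages[oid])) == [{'id': 1}, {'id': 2}, {'id': 3}]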
+ initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) + if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: + error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) + error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + if error_code == -400 and list_id == 'watchlater': + self.raise_login_required('You need to login to access your watchlater playlist') + elif error_code == -403: + self.raise_login_required('This is a private playlist. You need to login as its owner') + elif error_code == 11010: + raise ExtractorError('Playlist is no longer available', expected=True) + raise ExtractorError(f'Could not access playlist: {error_code} {error_message}') + + query = { + 'ps': 20, + 'with_current': False, + **traverse_obj(initial_state, { + 'type': ('playlist', 'type', {int_or_none}), + 'biz_id': ('playlist', 'id', {int_or_none}), + 'tid': ('tid', {int_or_none}), + 'sort_field': ('sortFiled', {int_or_none}), + 'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}), + }) + } + metadata = { + 'id': f'{query["type"]}_{query["biz_id"]}', + **traverse_obj(initial_state, ('mediaListInfo', { + 'title': ('title', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + })), + } + return self.playlist_result(self._extract_medialist(query, list_id), **metadata) class BilibiliCategoryIE(InfoExtractor): IE_NAME = 'Bilibili category extractor' _MAX_RESULTS = 1000000 - _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' _TESTS = [{ 'url': 'https://www.bilibili.com/v/kichiku/mad', 'info_dict': { @@ -1406,7 +1659,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' + _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', From 69b03f84f8378b0b5a2fbae56f9b7d860b2f529e Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:06:36 -0400 Subject: [PATCH 396/501] [ie/weibo] Fix extractor and support user extraction (#7657) Closes #3964, Closes #4673, Closes #6979 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/weibo.py | 319 +++++++++++++++++++++----------- 2 files changed, 215 insertions(+), 107 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a6a286766f..47d983c9cc 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2371,7 +2371,8 @@ ) from .weibo import ( WeiboIE, - WeiboMobileIE + WeiboVideoIE, + WeiboUserIE, ) from .weiqitv import WeiqiTVIE from .weverse import ( diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index bc9a71abe0..b0c3052b6a 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -1,134 +1,241 @@ -from .common import InfoExtractor - -import json import random -import re +import itertools +import urllib.parse -from ..compat import ( - compat_parse_qs, - compat_str, -) +from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + make_archive_id, + mimetype2ext, + parse_resolution, + str_or_none, strip_jsonp, + traverse_obj, + url_or_none, urlencode_postdata, + urljoin, ) -class 
WeiboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' - _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', - 'info_dict': { - 'id': 'Fp6RGfbff', - 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', - } - } +class WeiboBaseIE(InfoExtractor): + def _update_visitor_cookies(self, video_id): + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit guest request', + transform_source=strip_jsonp, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + })) - def _real_extract(self, url): - video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id) - - visitor_url = urlh.url - - if 'passport.weibo.com' in visitor_url: - # first visit - visitor_data = self._download_json( - 'https://passport.weibo.com/visitor/genvisitor', video_id, - note='Generating first-visit data', - transform_source=strip_jsonp, - headers={'Referer': visitor_url}, - data=urlencode_postdata({ - 'cb': 'gen_callback', - 'fp': json.dumps({ - 'os': '2', - 'browser': 'Gecko57,0,0,0', - 'fonts': 'undefined', - 'screenInfo': '1440*900*24', - 'plugins': '', - }), - })) - - tid = visitor_data['data']['tid'] - cnfd = '%03d' % visitor_data['data']['confidence'] - - self._download_webpage( - 'https://passport.weibo.com/visitor/visitor', video_id, - note='Running first-visit callback', - query={ - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random(), - }) - - webpage = self._download_webpage( - url, video_id, note='Revisiting webpage') - - title = self._html_extract_title(webpage) - - video_formats = compat_parse_qs(self._search_regex( - r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) - - formats = [] - supported_resolutions = (480, 720) - for res in supported_resolutions: - vid_urls = video_formats.get(compat_str(res)) - if not vid_urls or not isinstance(vid_urls, list): - continue - - vid_url = vid_urls[0] - formats.append({ - 'url': vid_url, - 'height': res, + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback to get guest cookies', + query={ + 'a': 'incarnate', + 't': visitor_data['data']['tid'], + 'w': 2, + 'c': '%03d' % visitor_data['data']['confidence'], + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), }) - uploader = self._og_search_property( - 'nick-name', webpage, 'uploader', default=None) + def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): + webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) + if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': + self._update_visitor_cookies(video_id) + webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs) + return self._parse_json(webpage, video_id, fatal=fatal) + def _extract_formats(self, video_info): + media_info = traverse_obj(video_info, ('page_info', 'media_info')) + formats = traverse_obj(media_info, ( + 'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', { + 'url': 'url', + 'format': ('quality_desc', {str}), + 'format_id': ('label', {str}), + 'ext': ('mime', {mimetype2ext}), + 'tbr': 
('bitrate', {int_or_none}, {lambda x: x or None}), + 'vcodec': ('video_codecs', {str}), + 'fps': ('fps', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'acodec': ('audio_codecs', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {int_or_none}), + })) + if not formats: # fallback, should be barely used + for url in set(traverse_obj(media_info, (..., {url_or_none}))): + if 'label=' in url: # filter out non-video urls + format_id, resolution = self._search_regex( + r'label=(\w+)&template=(\d+x\d+)', url, 'format info', + group=(1, 2), default=(None, None)) + formats.append({ + 'url': url, + 'format_id': format_id, + **parse_resolution(resolution), + **traverse_obj(media_info, ( + 'video_details', lambda _, v: v['label'].startswith(format_id), { + 'size': ('size', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + } + ), get_all=False), + }) + return formats + + def _parse_video_info(self, video_info, video_id=None): return { 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats + 'extractor_key': WeiboIE.ie_key(), + 'extractor': WeiboIE.IE_NAME, + 'formats': self._extract_formats(video_info), + 'http_headers': {'Referer': 'https://weibo.com/'}, + '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)], + **traverse_obj(video_info, { + 'id': (('id', 'id_str', 'mid'), {str_or_none}), + 'display_id': ('mblogid', {str_or_none}), + 'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}), + 'description': ('text_raw', {str}), + 'duration': ('page_info', 'media_info', 'duration', {int_or_none}), + 'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}), + 'thumbnail': ('page_info', 'page_pic', {url_or_none}), + 'uploader': ('user', 'screen_name', {str}), + 'uploader_id': ('user', ('id', 'id_str'), {str_or_none}), + 'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}), + 'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}), + 'like_count': ('attitudes_count', {int_or_none}), + 'repost_count': ('reposts_count', {int_or_none}), + }, get_all=False), + 'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None, } -class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?' 
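The `traverse_obj` call in `_extract_formats()` above does three things in one expression: it walks into `playback_list`, keeps only entries whose `play_info.url` survives `url_or_none`, and maps each survivor through a dict of field templates. A reduced sketch against a made-up payload (the sample data and trimmed template are illustrative, not Weibo's actual response shape):

    from yt_dlp.utils import int_or_none, traverse_obj, url_or_none

    # Made-up payload shaped like media_info['playback_list'] in the patch above.
    media_info = {'playback_list': [
        {'play_info': {'url': 'https://example.com/v.mp4', 'bitrate': 1500, 'width': 1280}},
        {'play_info': {'url': None}},  # no valid URL, dropped by the filter step
    ]}
    formats = traverse_obj(media_info, (
        'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
            'url': 'url',
            'tbr': ('bitrate', {int_or_none}),
            'width': ('width', {int_or_none}),
        }))
    assert formats == [{'url': 'https://example.com/v.mp4', 'tbr': 1500, 'width': 1280}]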
- _TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', +class WeiboIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://weibo.com/7827771738/N4xlMvjhI', + 'info_dict': { + 'id': '4910815147462302', + 'ext': 'mp4', + 'display_id': 'N4xlMvjhI', + 'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】', + 'description': 'md5:e2637a7673980d68694ea7c43cf12a5f', + 'duration': 918, + 'timestamp': 1686312819, + 'upload_date': '20230609', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '睡前视频基地', + 'uploader_id': '7827771738', + 'uploader_url': 'https://weibo.com/u/7827771738', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'], + }, + }, { + 'url': 'https://m.weibo.cn/status/4189191225395228', 'info_dict': { 'id': '4189191225395228', 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' + 'display_id': 'FBqgOmDxO', + 'title': '柴犬柴犬的秒拍视频', + 'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f', + 'duration': 53, + 'timestamp': 1514264429, + 'upload_date': '20171226', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '柴犬柴犬', + 'uploader_id': '5926682210', + 'uploader_url': 'https://weibo.com/u/5926682210', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } - } + }, { + 'url': 'https://weibo.com/0/4224132150961381', + 'note': 'no playback_list example', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note='visit the page') - weibo_info = self._parse_json(self._search_regex( - r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', - webpage, 'js_code', flags=re.DOTALL), - video_id, transform_source=js_to_json) + return self._parse_video_info(self._weibo_download_json( + f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id)) - status_data = weibo_info.get('status', {}) - page_info = status_data.get('page_info') - title = status_data['status_title'] - uploader = status_data.get('user', {}).get('screen_name') - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'url': page_info['media_info']['stream_url'] +class WeiboVideoIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow', + 'info_dict': { + 'id': '4797700463137878', + 'ext': 'mp4', + 'display_id': 'LEZDodaiW', + 'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', + 'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ​​​', + 'duration': 76, + 'timestamp': 1659344278, + 'upload_date': '20220801', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '君子爱财陈平安', + 'uploader_id': '3905382233', + 'uploader_url': 'https://weibo.com/u/3905382233', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode() + video_info = self._weibo_download_json( + f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}', + video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo'] + return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE) + + +class WeiboUserIE(WeiboBaseIE): + _VALID_URL = 
r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/u/2066652961?tabtype=video', + 'info_dict': { + 'id': '2066652961', + 'title': '萧影殿下的视频', + 'description': '萧影殿下的全部视频', + 'uploader': '萧影殿下', + }, + 'playlist_mincount': 195, + }] + + def _fetch_page(self, uid, cursor=0, page=1): + return self._weibo_download_json( + 'https://weibo.com/ajax/profile/getWaterFallContent', + uid, note=f'Downloading videos page {page}', + query={'uid': uid, 'cursor': cursor})['data'] + + def _entries(self, uid, first_page): + cursor = 0 + for page in itertools.count(1): + response = first_page if page == 1 else self._fetch_page(uid, cursor, page) + for video_info in traverse_obj(response, ('list', ..., {dict})): + yield self._parse_video_info(video_info) + cursor = response.get('next_cursor') + if (int_or_none(cursor) or -1) < 0: + break + + def _real_extract(self, url): + uid = self._match_id(url) + first_page = self._fetch_page(uid) + uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False) + metainfo = { + 'title': f'{uploader}的视频', + 'description': f'{uploader}的全部视频', + 'uploader': uploader, + } if uploader else {} + + return self.playlist_result(self._entries(uid, first_page), uid, **metainfo) From 8ac5b6d96ae5c60cd5ae2495949e0068a6754c45 Mon Sep 17 00:00:00 2001 From: u-spec-png <srdjankalaba@protonmail.ch> Date: Tue, 19 Sep 2023 01:36:10 +0200 Subject: [PATCH 397/501] [ie/N1Info:article] Fix extractor (#7373) Authored by: u-spec-png --- yt_dlp/extractor/n1.py | 52 +++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index 55345f3983..edc41443ab 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -33,7 +33,7 @@ def _real_extract(self, url): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor): 'upload_date': '20211102', 'timestamp': 1635861677, }, + }, { + 'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/', + 'info_dict': { + 'id': '1332368', + 'ext': 'mp4', + 'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama', + 'upload_date': '20230620', + 'timestamp': 1687290536, + 'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg' + }, }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, @@ -105,19 +115,35 @@ def _real_extract(self, url): title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) - - videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + plugin_data = self._html_search_meta('BridPlugin', webpage) entries = [] - for video in videos: - video_data = extract_attributes(video) - entries.append({ - '_type': 'url_transparent', - 'url': video_data.get('data-url'), - 'id': video_data.get('id'), - 'title': title, - 'thumbnail': video_data.get('data-thumbnail'), - 'timestamp': timestamp, - 
'ie_key': 'N1InfoAsset'}) + if plugin_data: + site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id') + for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage): + video_id = self._parse_json(video_data, title)['video'] + entries.append({ + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailURL', webpage), + 'formats': self._extract_m3u8_formats( + f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8', + video_id, fatal=False), + }) + else: + # Old player still present in older articles + videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': 'N1InfoAsset', + }) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) for embedded_video in embedded_videos: From 40999467f72db074a3f13057da9bf82a857530fe Mon Sep 17 00:00:00 2001 From: niemands <67282402+niemands@users.noreply.github.com> Date: Tue, 19 Sep 2023 01:37:17 +0200 Subject: [PATCH 398/501] [ie/pornbox] Add extractor (#7386) Authored by: niemands --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/pornbox.py | 113 ++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 yt_dlp/extractor/pornbox.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 47d983c9cc..dd670d59c2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1505,6 +1505,7 @@ from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE from .porn91 import Porn91IE +from .pornbox import PornboxIE from .porncom import PornComIE from .pornflip import PornFlipIE from .pornhd import PornHdIE diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py new file mode 100644 index 0000000000..c381382e93 --- /dev/null +++ b/yt_dlp/extractor/pornbox.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..compat import functools +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + qualities, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PornboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://pornbox.com/application/watch-page/212108', + 'md5': '3ff6b6e206f263be4c5e987a3162ac6e', + 'info_dict': { + 'id': '212108', + 'ext': 'mp4', + 'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49', + 'uploader': 'Lily Strong', + 'timestamp': 1665871200, + 'upload_date': '20221015', + 'age_limit': 18, + 'availability': 'needs_auth', + 'duration': 1505, + 'cast': ['Lily Strong', 'John Strong'], + 'tags': 'count:11', + 'description': 'md5:589c7f33e183aa8aa939537300efb859', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$' + } + }, { + 'url': 'https://pornbox.com/application/watch-page/216045', + 'info_dict': { + 'id': '216045', + 'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2', + 'description': 'md5:3e631dcaac029f15ed434e402d1b06c7', + 'uploader': 'VK Studio', + 'timestamp': 1618264800, + 'upload_date': '20210412', + 'age_limit': 18, + 'availability': 'premium_only', + 'duration': 2710, + 'cast': 'count:3', + 'tags': 'count:29', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$', + 'subtitles': 'count:6' + }, + 'params': { + 
'skip_download': True, + 'ignore_no_formats_error': True + }, + 'expected_warnings': [ + 'You are either not logged in or do not have access to this scene', + 'No video formats found', 'Requested format is not available'] + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id) + + subtitles = {country_code: [{ + 'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}', + 'ext': 'srt' + }] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))} + + is_free_scene = traverse_obj( + public_data, ('price', 'is_available_for_free', {bool}), default=False) + + metadata = { + 'id': video_id, + **traverse_obj(public_data, { + 'title': ('scene_name', {str.strip}), + 'description': ('small_description', {str.strip}), + 'uploader': 'studio', + 'duration': ('runtime', {parse_duration}), + 'cast': (('models', 'male_models'), ..., 'model_name'), + 'thumbnail': ('player_poster', {url_or_none}), + 'tags': ('niches', ..., 'niche'), + }), + 'age_limit': 18, + 'timestamp': parse_iso8601(traverse_obj( + public_data, ('studios', 'release_date'), 'publish_date')), + 'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene), + 'subtitles': subtitles, + } + + if not public_data.get('is_purchased') or not is_free_scene: + self.raise_login_required( + 'You are either not logged in or do not have access to this scene', metadata_available=True) + return metadata + + media_id = traverse_obj(public_data, ( + 'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False) + if not media_id: + self.raise_no_formats('Could not find stream id', video_id=video_id) + + stream_data = self._download_json( + f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls') + + get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k']) + metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], { + 'url': 'src', + 'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'format_id': ('quality', {str_or_none}), + 'quality': ('quality', {get_quality}), + 'width': ('size', {lambda x: int(x[:-1])}), + })) + + return metadata From cf11b40ac40e3d23a6352753296f3a732886efb9 Mon Sep 17 00:00:00 2001 From: Rohan Dey <142105763+Rohxn16@users.noreply.github.com> Date: Mon, 18 Sep 2023 23:39:20 +0000 Subject: [PATCH 399/501] [ie/media.ccc.de:lists] Fix extraction (#8144) Closes #8138 Authored by: Rohxn16 --- yt_dlp/extractor/ccc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ccc.py b/yt_dlp/extractor/ccc.py index 22e3a22ece..ca6b82c981 100644 --- a/yt_dlp/extractor/ccc.py +++ b/yt_dlp/extractor/ccc.py @@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor): 'id': '30c3', }, 'playlist_count': 135, + }, { + 'url': 'https://media.ccc.de/c/DS2023', + 'info_dict': { + 'title': 'Datenspuren 2023', + 'id': 'DS2023', + }, + 'playlist_count': 37 }] def _real_extract(self, url): - playlist_id = self._match_id(url).lower() + playlist_id = self._match_id(url) conf = self._download_json( 'https://media.ccc.de/public/conferences/' + playlist_id, From b532556d0a85e7d76f8f0880861232fb706ddbc5 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Tue, 19 Sep 2023 21:52:44 +0200 Subject: [PATCH 400/501] [ie/pr0gramm] Rewrite extractor (#8151) Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/pr0gramm.py 
| 218 ++++++++++++++++++++------------ 2 files changed, 139 insertions(+), 81 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index dd670d59c2..490b010b8d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1524,7 +1524,7 @@ PuhuTVIE, PuhuTVSerieIE, ) -from .pr0gramm import Pr0grammStaticIE, Pr0grammIE +from .pr0gramm import Pr0grammIE from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 2eb327fba1..c8e0bb493b 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,97 +1,155 @@ -import re +import json +from datetime import date +from urllib.parse import unquote from .common import InfoExtractor -from ..utils import merge_dicts +from ..compat import functools +from ..utils import ExtractorError, make_archive_id, urljoin +from ..utils.traversal import traverse_obj -class Pr0grammStaticIE(InfoExtractor): - # Possible urls: - # https://pr0gramm.com/static/5466437 - _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://pr0gramm.com/static/5466437', - 'md5': '52fa540d70d3edc286846f8ca85938aa', - 'info_dict': { - 'id': '5466437', - 'ext': 'mp4', - 'title': 'pr0gramm-5466437 by g11st', - 'uploader': 'g11st', - 'upload_date': '20221221', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # Fetch media sources - entries = self._parse_html5_media_entries(url, webpage, video_id) - media_info = entries[0] - - # Fetch author - uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') - - # Fetch approx upload timestamp from filename - # Have None-defaults in case the extraction fails - uploadDay = None - uploadMon = None - uploadYear = None - uploadTimestr = None - # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) - m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage) - - if (m): - # Up to a day of accuracy should suffice... - uploadDay = m.groupdict().get('day') - uploadMon = m.groupdict().get('mon') - uploadYear = m.groupdict().get('year') - uploadTimestr = uploadYear + uploadMon + uploadDay - - return merge_dicts({ - 'id': video_id, - 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), - 'uploader': uploader, - 'upload_date': uploadTimestr - }, media_info) - - -# This extractor is for the primary url (used for sharing, and appears in the -# location bar) Since this page loads the DOM via JS, yt-dl can't find any -# video information here. So let's redirect to a compatibility version of -# the site, which does contain the <video>-element by itself, without requiring -# js to be ran. class Pr0grammIE(InfoExtractor): - # Possible urls: - # https://pr0gramm.com/new/546637 - # https://pr0gramm.com/new/video/546637 - # https://pr0gramm.com/top/546637 - # https://pr0gramm.com/top/video/546637 - # https://pr0gramm.com/user/g11st/uploads/5466437 - # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290 - # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030 - # https://pr0gramm.com/user/froschler/1elf/5232030 - # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id! 
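The `Pr0grammStaticIE` removed by this patch guessed the upload date from the media CDN path rather than from any API field. A standalone sketch of that now-retired trick, reusing the regex and the sample URL from the deleted code:

    import re

    cdn_url = '//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4'
    m = re.search(
        r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', cdn_url)
    if m:
        print(m['year'] + m['mon'] + m['day'])  # '20221221', day-level accuracy at best

The rewrite below drops this guesswork in favor of the `created` timestamp returned by the API.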
- # https://pr0gramm.com/top/fruher war alles damals/5498175 - - _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)' - _TEST = { + _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)' + _TESTS = [{ + # Tags require account 'url': 'https://pr0gramm.com/new/video/5466437', 'info_dict': { 'id': '5466437', 'ext': 'mp4', 'title': 'pr0gramm-5466437 by g11st', + 'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'], 'uploader': 'g11st', + 'uploader_id': 394718, + 'upload_timestamp': 1671590240, 'upload_date': '20221221', - } - } + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + # Tags require account + 'url': 'https://pr0gramm.com/new/3052805:comment28391322', + 'info_dict': { + 'id': '3052805', + 'ext': 'mp4', + 'title': 'pr0gramm-3052805 by Hansking1', + 'tags': 'count:15', + 'uploader': 'Hansking1', + 'uploader_id': 385563, + 'upload_timestamp': 1552930408, + 'upload_date': '20190318', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + # Requires verified account + 'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332', + 'info_dict': { + 'id': '5848332', + 'ext': 'mp4', + 'title': 'pr0gramm-5848332 by erd0pfel', + 'tags': 'count:18', + 'uploader': 'erd0pfel', + 'uploader_id': 349094, + 'upload_timestamp': 1694489652, + 'upload_date': '20230912', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + 'url': 'https://pr0gramm.com/static/5466437', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290', + 'only_matching': True, + }] - def _generic_title(): - return "oof" + BASE_URL = 'https://pr0gramm.com' + + @functools.cached_property + def _is_logged_in(self): + return 'pp' in self._get_cookies(self.BASE_URL) + + @functools.cached_property + def _maximum_flags(self): + # We need to guess the flags for the content otherwise the api will raise an error + # We can guess the maximum allowed flags for the account from the cookies + # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw + flags = 0b0001 + if self._is_logged_in: + flags |= 0b1000 + cookies = self._get_cookies(self.BASE_URL) + if 'me' not in cookies: + self._download_webpage(self.BASE_URL, None, 'Refreshing verification information') + if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')): + flags |= 0b0110 + + return flags + + def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'): + data = self._download_json( + f'https://pr0gramm.com/api/items/{endpoint}', + video_id, note, query=query, expected_status=403) + + error = traverse_obj(data, ('error', {str})) + if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'): + if not self._is_logged_in: + self.raise_login_required() + raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True) + elif error: + message = traverse_obj(data, ('msg', {str})) or error + raise ExtractorError(f'API returned error: {message}', expected=True) + + return data def _real_extract(self, url): video_id = self._match_id(url) + video_info = 
traverse_obj( + self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}), + ('items', 0, {dict})) - return self.url_result( - 'https://pr0gramm.com/static/' + video_id, - video_id=video_id, - ie=Pr0grammStaticIE.ie_key()) + source = urljoin('https://img.pr0gramm.com', video_info.get('image')) + if not source or not source.endswith('mp4'): + self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id) + + tags = None + if self._is_logged_in: + metadata = self._call_api('info', video_id, {'itemId': video_id}) + tags = traverse_obj(metadata, ('tags', ..., 'tag', {str})) + # Sorted by "confidence", higher confidence = earlier in list + confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) + if confidences: + tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] + + return { + 'id': video_id, + 'title': f'pr0gramm-{video_id} by {video_info.get("user")}', + 'formats': [{ + 'url': source, + 'ext': 'mp4', + **traverse_obj(video_info, { + 'width': ('width', {int}), + 'height': ('height', {int}), + }), + }], + 'tags': tags, + 'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0, + '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)], + **traverse_obj(video_info, { + 'uploader': ('user', {str}), + 'uploader_id': ('userId', {int}), + 'like_count': ('up', {int}), + 'dislike_count': ('down', {int}), + 'upload_timestamp': ('created', {int}), + 'upload_date': ('created', {int}, {date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}), + 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}) + }), + } From 9d6254069c75877bc88bc3584f4326fb1853a543 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Wed, 20 Sep 2023 19:14:10 +0000 Subject: [PATCH 401/501] Update to ytdl-commit-66ab08 (#8128) [utils] Revert bbd3e7e, updating docstring, test instead https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9 Authored by: coletdjnz --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7b73f4fd6..d94d8ea822 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ # NEW FEATURES -* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) +* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@66ab08**](https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API From 35f9a306e6934793cff100200cd03f288ec33f11 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:58:53 -0500 Subject: [PATCH 402/501] [dependencies] Handle deprecation of `sqlite3.version` (#8167) Closes #8152 Authored by: bashonly --- yt_dlp/compat/compat_utils.py | 2 +- yt_dlp/dependencies/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py 
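The fix below works because `get_package_info` probes candidate attribute names in order and takes the first non-empty one; exposing the underlying SQLite library version under a private `_yt_dlp__version` attribute therefore shadows the deprecated `sqlite3.version`. A minimal standalone sketch of that probing idea (not the actual yt-dlp function):

    import sqlite3

    # mirrors the shim added below: sqlite3.version is deprecated in Python 3.12,
    # so surface the underlying library version under a custom attribute instead
    sqlite3._yt_dlp__version = sqlite3.sqlite_version

    def probe_version(module, attrs=('_yt_dlp__version', '__version__', 'version_string', 'version')):
        # first non-empty version attribute wins, as in get_package_info()
        return next(filter(None, (getattr(module, attr, None) for attr in attrs)), None)

    print(probe_version(sqlite3))  # e.g. '3.42.0'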
index 3ca46d270c..d62b7d0488 100644 --- a/yt_dlp/compat/compat_utils.py +++ b/yt_dlp/compat/compat_utils.py @@ -15,7 +15,7 @@ def get_package_info(module): name=getattr(module, '_yt_dlp__identifier', module.__name__), version=str(next(filter(None, ( getattr(module, attr, None) - for attr in ('__version__', 'version_string', 'version') + for attr in ('_yt_dlp__version', '__version__', 'version_string', 'version') )), None))) diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py index 6e7d29c5ca..b56e4f5cc6 100644 --- a/yt_dlp/dependencies/__init__.py +++ b/yt_dlp/dependencies/__init__.py @@ -43,6 +43,8 @@ try: import sqlite3 + # We need to get the underlying `sqlite` version, see https://github.com/yt-dlp/yt-dlp/issues/8152 + sqlite3._yt_dlp__version = sqlite3.sqlite_version except ImportError: # although sqlite3 is part of the standard library, it is possible to compile python without # sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544 From 295fbb3ae3a7d0dd50e286be5c487cf145ed5778 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Fri, 22 Sep 2023 01:28:20 +0800 Subject: [PATCH 403/501] [ie/eplus:inbound] Add extractor (#5782) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/eplus.py | 96 +++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 yt_dlp/extractor/eplus.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 490b010b8d..3ce6baef2f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -565,6 +565,7 @@ EpiconIE, EpiconSeriesIE, ) +from .eplus import EplusIbIE from .epoch import EpochIE from .eporner import EpornerIE from .eroprofile import ( diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py new file mode 100644 index 0000000000..3ebdcf5fbe --- /dev/null +++ b/yt_dlp/extractor/eplus.py @@ -0,0 +1,96 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_call, + unified_timestamp, +) + + +class EplusIbIE(InfoExtractor): + IE_NAME = 'eplus:inbound' + IE_DESC = 'e+ (イープラス) overseas' + _VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)' + _TESTS = [{ + 'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', + 'info_dict': { + 'id': '354502-0001-002', + 'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022~LIVE with a smile!~【Streaming+(配信)】', + 'live_status': 'was_live', + 'release_date': '20211231', + 'release_timestamp': 1640952000, + 'description': str, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. 
This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + ], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id) + + delivery_status = data_json.get('delivery_status') + archive_mode = data_json.get('archive_mode') + release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) + release_timestamp_str = data_json.get('event_datetime_text') # JST + + self.write_debug(f'delivery_status = {delivery_status}, archive_mode = {archive_mode}') + + if delivery_status == 'PREPARING': + live_status = 'is_upcoming' + elif delivery_status == 'STARTED': + live_status = 'is_live' + elif delivery_status == 'STOPPED': + if archive_mode != 'ON': + raise ExtractorError( + 'This event has ended and there is no archive for this event', expected=True) + live_status = 'post_live' + elif delivery_status == 'WAIT_CONFIRM_ARCHIVED': + live_status = 'post_live' + elif delivery_status == 'CONFIRMED_ARCHIVE': + live_status = 'was_live' + else: + self.report_warning(f'Unknown delivery_status {delivery_status}, treat it as a live') + live_status = 'is_live' + + formats = [] + + m3u8_playlist_urls = self._search_json( + r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[]) + if not m3u8_playlist_urls: + if live_status == 'is_upcoming': + self.raise_no_formats( + f'Could not find the playlist URL. This live event will begin at {release_timestamp_str} JST', expected=True) + else: + self.raise_no_formats( + 'Could not find the playlist URL. This event may not be accessible', expected=True) + elif live_status == 'is_upcoming': + self.raise_no_formats(f'This live event will begin at {release_timestamp_str} JST', expected=True) + elif live_status == 'post_live': + self.raise_no_formats('This event has ended, and the archive will be available shortly', expected=True) + else: + for m3u8_playlist_url in m3u8_playlist_urls: + formats.extend(self._extract_m3u8_formats(m3u8_playlist_url, video_id)) + # FIXME: HTTP request headers need to be updated to continue download + warning = 'Due to technical limitations, the download will be interrupted after one hour' + if live_status == 'is_live': + self.report_warning(warning) + elif live_status == 'was_live': + self.report_warning(f'{warning}. 
You can restart to continue the download') + + return { + 'id': data_json['app_id'], + 'title': data_json.get('app_name'), + 'formats': formats, + 'live_status': live_status, + 'description': data_json.get('content'), + 'release_timestamp': release_timestamp, + } From b3febedbeb662dfdf9b5c1d5799039ad4fc969de Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:30:32 -0600 Subject: [PATCH 404/501] [ie/Canal1,CaracolTvPlay] Add extractors (#7151) Closes #5826 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/canal1.py | 39 +++++++++ yt_dlp/extractor/caracoltv.py | 136 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/mediastream.py | 8 +- 4 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/extractor/canal1.py create mode 100644 yt_dlp/extractor/caracoltv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3ce6baef2f..632d6720e1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -296,9 +296,11 @@ from .camsoda import CamsodaIE from .camtasia import CamtasiaEmbedIE from .camwithher import CamWithHerIE +from .canal1 import Canal1IE from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .caracoltv import CaracolTvPlayIE from .carambatv import ( CarambaTVIE, CarambaTVPageIE, diff --git a/yt_dlp/extractor/canal1.py b/yt_dlp/extractor/canal1.py new file mode 100644 index 0000000000..587a11ab8c --- /dev/null +++ b/yt_dlp/extractor/canal1.py @@ -0,0 +1,39 @@ +from .common import InfoExtractor + + +class Canal1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|noticias\.)?canal1\.com\.co/(?:[^?#&])+/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://canal1.com.co/noticias/napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco/', + 'info_dict': { + 'id': '63b39f6b354977084b85ab54', + 'display_id': 'napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco', + 'title': 'Ñapa I Una cadena de producción de arroz que se quedó en veremos y abandonada en el departamento del Chocó', + 'description': 'md5:bc49c6d64d20610ea1e7daf079a0d013', + 'thumbnail': r're:^https?://[^?#]+63b39f6b354977084b85ab54', + 'ext': 'mp4', + }, + }, { + 'url': 'https://noticias.canal1.com.co/noticias/tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter/', + 'info_dict': { + 'id': '63b39e93f5fd223aa32250fb', + 'display_id': 'tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter', + 'title': 'Tres I El triste récord que impuso Elon Musk, el dueño de Tesla y de Twitter', + 'description': 'md5:d9f691f131a21ce6767ca6c05d17d791', + 'thumbnail': r're:^https?://[^?#]+63b39e93f5fd223aa32250fb', + 'ext': 'mp4', + }, + }, { + # Geo-restricted to Colombia + 'url': 'https://canal1.com.co/programas/guerreros-canal-1/video-inedito-guerreros-despedida-kewin-zarate/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self.url_result( + self._search_regex(r'"embedUrl"\s*:\s*"([^"]+)', webpage, 'embed url'), + display_id=display_id, url_transparent=True) diff --git a/yt_dlp/extractor/caracoltv.py b/yt_dlp/extractor/caracoltv.py new file mode 100644 index 0000000000..79f7752fe0 --- /dev/null +++ b/yt_dlp/extractor/caracoltv.py @@ -0,0 +1,136 @@ +import base64 +import 
json +import uuid + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + traverse_obj, + urljoin, +) + + +class CaracolTvPlayIE(InfoExtractor): + _VALID_URL = r'https?://play\.caracoltv\.com/videoDetails/(?P<id>[^/?#]+)' + _NETRC_MACHINE = 'caracoltv-play' + + _TESTS = [{ + 'url': 'https://play.caracoltv.com/videoDetails/OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'info_dict': { + 'id': 'OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'title': 'La teoría del promedio', + 'description': 'md5:1cdd6d2c13f19ef0d9649ab81a023ac3', + }, + 'playlist_count': 6, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==/ella?season=0', + 'info_dict': { + 'id': 'OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==', + 'title': 'Ella', + 'description': 'md5:a639b1feb5ddcc0cff92a489b4e544b8', + }, + 'playlist_count': 10, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==/la-vuelta-al-mundo-en-80-risas-2022?season=0', + 'info_dict': { + 'id': 'OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==', + 'title': 'La vuelta al mundo en 80 risas 2022', + 'description': 'md5:e97aac36106e5c37ebf947b3350106a4', + }, + 'playlist_count': 17, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/MzoxX3BwbjRmNjB1', + 'only_matching': True, + }] + + _USER_TOKEN = None + + def _extract_app_token(self, webpage): + config_js_path = self._search_regex( + r'<script[^>]+src\s*=\s*"([^"]+coreConfig.js[^"]+)', webpage, 'config js url', fatal=False) + + mediation_config = {} if not config_js_path else self._search_json( + r'mediation\s*:', self._download_webpage( + urljoin('https://play.caracoltv.com/', config_js_path), None, fatal=False, note='Extracting JS config'), + 'mediation_config', None, transform_source=js_to_json, fatal=False) + + key = traverse_obj( + mediation_config, ('live', 'key')) or '795cd9c089a1fc48094524a5eba85a3fca1331817c802f601735907c8bbb4f50' + secret = traverse_obj( + mediation_config, ('live', 'secret')) or '64dec00a6989ba83d087621465b5e5d38bdac22033b0613b659c442c78976fa0' + + return base64.b64encode(f'{key}:{secret}'.encode()).decode() + + def _perform_login(self, email, password): + webpage = self._download_webpage('https://play.caracoltv.com/', None, fatal=False) + app_token = self._extract_app_token(webpage) + + bearer_token = self._download_json( + 'https://eu-gateway.inmobly.com/applications/oauth', None, data=b'', note='Retrieving bearer token', + headers={'Authorization': f'Basic {app_token}'})['token'] + + self._USER_TOKEN = self._download_json( + 'https://eu-gateway.inmobly.com/user/login', None, note='Performing login', headers={ + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {bearer_token}', + }, data=json.dumps({ + 'device_data': { + 'device_id': str(uuid.uuid4()), + 'device_token': '', + 'device_type': 'web' + }, + 'login_data': { + 'enabled': True, + 'email': email, + 'password': password, + } + }).encode())['user_token'] + + def _extract_video(self, video_data, series_id=None, season_id=None, season_number=None): + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['stream_url'], series_id, 'mp4') + + return { + 'id': video_data['id'], + 'title': video_data.get('name'), + 'description': video_data.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': traverse_obj( + video_data, ('extra_thumbs', ..., {'url': 'thumb_url', 'height': 'height', 'width': 'width'})), + 
'series_id': series_id, + 'season_id': season_id, + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(video_data.get('item_order')), + 'is_live': video_data.get('entry_type') == 3, + } + + def _extract_series_seasons(self, seasons, series_id): + for season in seasons: + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'season_id': season['id']}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'}) + + season_number = season.get('order') + for episode in api_response['items']: + yield self._extract_video(episode, series_id, season['id'], season_number) + + def _real_extract(self, url): + series_id = self._match_id(url) + + if self._USER_TOKEN is None: + self._perform_login('guest@inmobly.com', 'Test@gus1') + + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'include_ids': series_id}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'})['items'][0] + + if not api_response.get('seasons'): + return self._extract_video(api_response) + + return self.playlist_result( + self._extract_series_seasons(api_response['seasons'], series_id), + series_id, **traverse_obj(api_response, { + 'title': 'name', + 'description': 'description', + })) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index d5c9aab8a3..b8cb5a691c 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -106,8 +106,12 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if 'Debido a tu ubicación no puedes ver el contenido' in webpage: - self.raise_geo_restricted() + for message in [ + 'Debido a tu ubicación no puedes ver el contenido', + 'You are not allowed to watch this video: Geo Fencing Restriction' + ]: + if message in webpage: + self.raise_geo_restricted() player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id) From 21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:34:35 -0400 Subject: [PATCH 405/501] [ie/douyutv] Fix extractors (#7652) Closes #2494, Closes #7295 Authored by: c-basalt --- yt_dlp/extractor/douyutv.py | 273 ++++++++++++++++++++++++------------ 1 file changed, 184 insertions(+), 89 deletions(-) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index fa40844df5..ee8893d5af 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -1,31 +1,72 @@ import time import hashlib -import re import urllib +import uuid from .common import InfoExtractor +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, + UserNotLive, + determine_ext, + int_or_none, + js_to_json, + parse_resolution, + str_or_none, + traverse_obj, unescapeHTML, - unified_strdate, + url_or_none, + urlencode_postdata, urljoin, ) -class DouyuTVIE(InfoExtractor): - IE_DESC = '斗鱼' +class DouyuBaseIE(InfoExtractor): + def _download_cryptojs_md5(self, video_id): + for url in [ + 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + 'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + ]: + js_code = self._download_webpage( + url, video_id, note='Downloading signing dependency', fatal=False) + if js_code: + self.cache.store('douyu', 'crypto-js-md5', js_code) + return js_code + raise ExtractorError('Unable to download JS dependency (crypto-js/md5)') + + def 
_get_cryptojs_md5(self, video_id): + return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id) + + def _calc_sign(self, sign_func, video_id, a): + b = uuid.uuid4().hex + c = round(time.time()) + js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' + phantom = PhantomJSwrapper(self) + result = phantom.execute(js_script, video_id, + note='Executing JS signing script').strip() + return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} + + def _search_js_sign_func(self, webpage, fatal=True): + # The greedy look-behind ensures last possible script tag is matched + return self._search_regex( + r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal) + + +class DouyuTVIE(DouyuBaseIE): + IE_DESC = '斗鱼直播' _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ - 'url': 'http://www.douyutv.com/iseven', + 'url': 'https://www.douyu.com/pigff', 'info_dict': { - 'id': '17732', - 'display_id': 'iseven', - 'ext': 'flv', - 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': r're:.*m7show@163\.com.*', - 'thumbnail': r're:^https?://.*\.png', - 'uploader': '7师傅', + 'id': '24422', + 'display_id': 'pigff', + 'ext': 'mp4', + 'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群', + 'thumbnail': str, + 'uploader': 'pigff', 'is_live': True, + 'live_status': 'is_live', }, 'params': { 'skip_download': True, @@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] + def _get_sign_func(self, room_id, video_id): + return self._download_json( + f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id, + note='Getting signing script')['data'][f'room{room_id}'] + + def _extract_stream_formats(self, stream_formats): + formats = [] + for stream_info in traverse_obj(stream_formats, (..., 'data')): + stream_url = urljoin( + traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live')) + if stream_url: + rate_id = traverse_obj(stream_info, ('rate', {int_or_none})) + rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False) + ext = determine_ext(stream_url) + formats.append({ + 'url': stream_url, + 'format_id': str_or_none(rate_id), + 'ext': 'mp4' if ext == 'm3u8' else ext, + 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', + 'quality': rate_id % -10000 if rate_id is not None else None, + **traverse_obj(rate_info, { + 'format': ('name', {str_or_none}), + 'tbr': ('bit', {int_or_none}), + }), + }) + return formats + def _real_extract(self, url): video_id = self._match_id(url) - if video_id.isdigit(): - room_id = video_id - else: - page = self._download_webpage(url, video_id) - room_id = self._html_search_regex( - r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + webpage = self._download_webpage(url, video_id) + room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id') + + if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1': + raise UserNotLive('The channel is auto-playing VODs', video_id=video_id) + if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2': + raise UserNotLive(video_id=video_id) # Grab metadata from API params = { @@ -102,110 +171,136 @@ def _real_extract(self, url): 'time': int(time.time()), } params['auth'] = hashlib.md5( - 
f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() - room = self._download_json( + f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() + room = traverse_obj(self._download_json( f'http://www.douyutv.com/api/v1/room/{room_id}', video_id, - note='Downloading room info', query=params)['data'] + note='Downloading room info', query=params, fatal=False), 'data') # 1 = live, 2 = offline - if room.get('show_status') == '2': - raise ExtractorError('Live stream is offline', expected=True) + if traverse_obj(room, 'show_status') == '2': + raise UserNotLive(video_id=video_id) - video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL')) - formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id) + js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id) + form_data = { + 'rate': 0, + **self._calc_sign(js_sign_func, video_id, room_id), + } + stream_formats = [self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + video_id, note="Downloading livestream format", + data=urlencode_postdata(form_data))] - title = unescapeHTML(room['room_name']) - description = room.get('show_details') - thumbnail = room.get('room_src') - uploader = room.get('nickname') + for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')): + if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')): + form_data['rate'] = rate_id + stream_formats.append(self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + video_id, note=f'Downloading livestream format {rate_id}', + data=urlencode_postdata(form_data))) return { 'id': room_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, + 'formats': self._extract_stream_formats(stream_formats), 'is_live': True, - 'subtitles': subs, - 'formats': formats, + **traverse_obj(room, { + 'display_id': ('url', {str}, {lambda i: i[1:]}), + 'title': ('room_name', {unescapeHTML}), + 'description': ('show_details', {str}), + 'uploader': ('nickname', {str}), + 'thumbnail': ('room_src', {url_or_none}), + }) } -class DouyuShowIE(InfoExtractor): +class DouyuShowIE(DouyuBaseIE): _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', - 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY', 'info_dict': { - 'id': 'rjNBdvnVXNzvE2yw', + 'id': 'mPyq7oVNe5Yv1gLY', 'ext': 'mp4', - 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', - 'duration': 7150.08, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '陈一发儿', - 'uploader_id': 'XrZwYelr5wbK', - 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', - 'upload_date': '20170402', + 'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃', + 'duration': 633, + 'thumbnail': str, + 'uploader': '美食作家王刚V', + 'uploader_id': 'OVAO4NVx1m7Q', + 'timestamp': 1661850002, + 'upload_date': '20220830', + 'view_count': int, + 'tags': ['美食', '美食综合'], }, }, { 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', 'only_matching': True, }] + _FORMATS = { + 'super': '原画', + 'high': '超清', + 'normal': '高清', + } + + _QUALITIES = { + 'super': -1, + 'high': -2, + 'normal': -3, + } + + _RESOLUTIONS = { + 'super': '1920x1080', + 'high': '1280x720', + 'normal': '852x480', + } + def _real_extract(self, url): url = url.replace('vmobile.', 
'v.') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - room_info = self._parse_json(self._search_regex( - r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + video_info = self._search_json( + r'<script>\s*window\.\$DATA\s*=', webpage, + 'video info', video_id, transform_source=js_to_json) - video_info = None + js_sign_func = self._search_js_sign_func(webpage) + form_data = { + 'vid': video_id, + **self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']), + } + url_info = self._download_json( + 'https://v.douyu.com/api/stream/getStreamUrl', video_id, + data=urlencode_postdata(form_data), note="Downloading video formats") - for trial in range(5): - # Sometimes Douyu rejects our request. Let's try it more times - try: - video_info = self._download_json( - 'https://vmobile.douyu.com/video/getInfo', video_id, - query={'vid': video_id}, - headers={ - 'Referer': url, - 'x-requested-with': 'XMLHttpRequest', - }) - break - except ExtractorError: - self._sleep(1, video_id) - - if not video_info: - raise ExtractorError('Can\'t fetch video info') - - formats = self._extract_m3u8_formats( - video_info['data']['video_url'], video_id, - entry_protocol='m3u8_native', ext='mp4') - - upload_date = unified_strdate(self._html_search_regex( - r'<em>上传时间:</em><span>([^<]+)</span>', webpage, - 'upload date', fatal=False)) - - uploader = uploader_id = uploader_url = None - mobj = re.search( - r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', - webpage) - if mobj: - uploader_id, uploader = mobj.groups() - uploader_url = urljoin(url, '/author/' + uploader_id) + formats = [] + for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)): + video_url = traverse_obj(url, ('url', {url_or_none})) + if video_url: + ext = determine_ext(video_url) + formats.append({ + 'format': self._FORMATS.get(name), + 'format_id': name, + 'url': video_url, + 'quality': self._QUALITIES.get(name), + 'ext': 'mp4' if ext == 'm3u8' else ext, + 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', + **parse_resolution(self._RESOLUTIONS.get(name)) + }) + else: + self.to_screen( + f'"{self._FORMATS.get(name, name)}" format may require logging in. 
{self._login_hint()}') return { 'id': video_id, - 'title': room_info['name'], 'formats': formats, - 'duration': room_info.get('duration'), - 'thumbnail': room_info.get('pic'), - 'upload_date': upload_date, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, + **traverse_obj(video_info, ('DATA', { + 'title': ('content', 'title', {str}), + 'uploader': ('content', 'author', {str}), + 'uploader_id': ('content', 'up_id', {str_or_none}), + 'duration': ('content', 'video_duration', {int_or_none}), + 'thumbnail': ('content', 'video_pic', {url_or_none}), + 'timestamp': ('content', 'create_time', {int_or_none}), + 'view_count': ('content', 'view_num', {int_or_none}), + 'tags': ('videoTag', ..., 'tagName', {str}), + })) } From 5fccabac27ca3c1165ade1b0df6fbadc24258dc2 Mon Sep 17 00:00:00 2001 From: Simon <simon30002021@icloud.com> Date: Thu, 21 Sep 2023 19:37:58 +0200 Subject: [PATCH 406/501] [ie/rbgtum] Fix extraction and support new URL format (#7690) Authored by: simon300000 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rbgtum.py | 79 ++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 632d6720e1..9cda06d8fa 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1601,6 +1601,7 @@ from .rbgtum import ( RbgTumIE, RbgTumCourseIE, + RbgTumNewCourseIE, ) from .rcs import ( RCSIE, diff --git a/yt_dlp/extractor/rbgtum.py b/yt_dlp/extractor/rbgtum.py index 47649cfc58..c8a331f3ee 100644 --- a/yt_dlp/extractor/rbgtum.py +++ b/yt_dlp/extractor/rbgtum.py @@ -1,10 +1,11 @@ import re from .common import InfoExtractor +from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError class RbgTumIE(InfoExtractor): - _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' + _VALID_URL = r'https://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)' _TESTS = [{ # Combined view 'url': 'https://live.rbg.tum.de/w/cpp/22128', @@ -35,16 +36,18 @@ class RbgTumIE(InfoExtractor): 'title': 'Fachschaftsvollversammlung', 'series': 'Fachschaftsvollversammlung Informatik', } + }, { + 'url': 'https://tum.live/w/linalginfo/27102', + 'only_matching': True, }, ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') - lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') - lecture_series_title = self._html_search_regex( - r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?', webpage, 'series') + m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8') + lecture_title = self._html_search_regex(r']*>([^<]+)', webpage, 'title', fatal=False) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') @@ -57,9 +60,9 @@ def _real_extract(self, url): class RbgTumCourseIE(InfoExtractor): - _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P.+)' + _VALID_URL = r'https://(?P(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P(?P\d+)/(?P\w+)/(?P[^/?#]+))' _TESTS = [{ - 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv', 'info_dict': { 'title': 'Funktionale Programmierung und Verifikation (IN0003)', 'id': '2022/S/fpv', @@ -69,7 +72,7 @@ class RbgTumCourseIE(InfoExtractor): }, 
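Both of the extraction changes in `RbgTumIE` above are small but easy to misread: `remove_start` strips a prefix only when it is actually present, and the loosened m3u8 regex now tolerates query strings after the extension. A throwaway check with a fabricated page snippet (the regex and helper are the ones used above):

    import re
    from yt_dlp.utils import remove_start

    page = '<title>TUM-Live | SET FSMPIC</title> src="https://example.com/playlist.m3u8?token=x"'
    print(remove_start('TUM-Live | SET FSMPIC', 'TUM-Live | '))      # 'SET FSMPIC'
    print(remove_start('SET FSMPIC', 'TUM-Live | '))                 # unchanged: 'SET FSMPIC'
    print(re.search(r'"(https://[^"]+\.m3u8[^"]*)', page).group(1))  # 'https://example.com/playlist.m3u8?token=x'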
'playlist_count': 13, }, { - 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'url': 'https://live.rbg.tum.de/old/course/2022/W/set', 'info_dict': { 'title': 'SET FSMPIC', 'id': '2022/W/set', @@ -78,16 +81,62 @@ class RbgTumCourseIE(InfoExtractor): 'noplaylist': False, }, 'playlist_count': 6, + }, { + 'url': 'https://tum.live/old/course/2023/S/linalginfo', + 'only_matching': True, }, ] def _real_extract(self, url): - course_id = self._match_id(url) - webpage = self._download_webpage(url, course_id) + course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug') + meta = self._download_json( + f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False, + query={'year': year, 'term': term}) or {} + lecture_series_title = meta.get('Name') + lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE) + for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))] - lecture_series_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') + if not lectures: + webpage = self._download_webpage(url, course_id) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') + lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE) + for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)] - lecture_urls = [] - for lecture_url in re.findall(r'(?i)href="/w/(.+)(?(?:live\.rbg\.tum\.de|tum\.live))/\?' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, { + 'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3', + 'only_matching': True, + }] + + def _real_extract(self, url): + query = parse_qs(url) + errors = [key for key in ('year', 'term', 'slug') if not query.get(key)] + if errors: + raise ExtractorError(f'Input URL is missing query parameters: {", ".join(errors)}') + year, term, slug = query['year'][0], query['term'][0], query['slug'][0] + hostname = self._match_valid_url(url).group('hostname') + + return self.url_result(f'https://{hostname}/old/course/{year}/{term}/{slug}', RbgTumCourseIE) From b84fda7388dd20d38921e23b469147f3957c1812 Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:45:18 +0000 Subject: [PATCH 407/501] [ie/bilibili] Extract Dolby audio formats (#8142) Closes #4050 Authored by: ClosedPort22 --- yt_dlp/extractor/bilibili.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 5e7042dbbd..9119f396be 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -49,14 +49,14 @@ def extract_formats(self, play_info): for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) } - audios = traverse_obj(play_info, ('dash', 'audio', ...)) + audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict})) flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) if flac_audio: audios.append(flac_audio) formats = [{ 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), 'ext': 
mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), - 'acodec': audio.get('codecs'), + 'acodec': traverse_obj(audio, ('codecs', {str.lower})), 'vcodec': 'none', 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), 'filesize': int_or_none(audio.get('size')), @@ -71,6 +71,7 @@ def extract_formats(self, play_info): 'height': int_or_none(video.get('height')), 'vcodec': video.get('codecs'), 'acodec': 'none' if audios else None, + 'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))), 'tbr': float_or_none(video.get('bandwidth'), scale=1000), 'filesize': int_or_none(video.get('size')), 'quality': int_or_none(video.get('id')), From a5e264d74b4bd60c6e7ec4e38f1a23af4e420531 Mon Sep 17 00:00:00 2001 From: kylegustavo Date: Thu, 21 Sep 2023 10:46:49 -0700 Subject: [PATCH 408/501] [ie/Expressen] Improve `_VALID_URL` (#8153) Closes #8141 Authored by: kylegustavo --- yt_dlp/extractor/expressen.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py index 86967b631b..b96f2e4cbb 100644 --- a/yt_dlp/extractor/expressen.py +++ b/yt_dlp/extractor/expressen.py @@ -11,8 +11,8 @@ class ExpressenIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.)?(?:expressen|di)\.se/ - (?:(?:tvspelare/video|videoplayer/embed)/)? - tv/(?:[^/]+/)* + (?:(?:tvspelare/video|video-?player/embed)/)? + (?:tv|nyheter)/(?:[^/?#]+/)* (?P[^/?#&]+) ''' _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] @@ -42,6 +42,12 @@ class ExpressenIE(InfoExtractor): }, { 'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/video-player/embed/tv/nyheter/ekero-fodda-olof-gustafsson-forvaltar-knarkbaronen-pablo-escobars-namn', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/nyheter/efter-egna-telefonbluffen-escobar-stammer-klarna/', + 'only_matching': True, }] def _real_extract(self, url): From 2269065ad60cb0ab62408ae6a7b20283e5252232 Mon Sep 17 00:00:00 2001 From: std-move <26625259+std-move@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:19:52 +0200 Subject: [PATCH 409/501] [ie/NovaEmbed] Fix extractor (#7910) Closes #8025 Authored by: std-move --- yt_dlp/extractor/nova.py | 116 +++++++++++++++------------------------ 1 file changed, 45 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index 8bd3fd4725..bd0c4ebe34 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -6,7 +6,6 @@ determine_ext, int_or_none, js_to_json, - qualities, traverse_obj, unified_strdate, url_or_none, @@ -49,77 +48,52 @@ def _real_extract(self, url): duration = None formats = [] - player = self._parse_json( - self._search_regex( - (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P{.*?})\s*\)(?:\s*\))?\s*,', - r'Player\.init\s*\([^,]+,(?P\s*\w+\s*\?)?\s*(?P{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), - webpage, 'player', default='{}', group='json'), video_id, fatal=False) - if player: - for format_id, format_list in player['tracks'].items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_dict in format_list: - if not isinstance(format_dict, dict): - continue - if (not 
self.get_param('allow_unplayable_formats') - and traverse_obj(format_dict, ('drm', 'keySystem'))): - has_drm = True - continue - format_url = url_or_none(format_dict.get('src')) - format_type = format_dict.get('type') - ext = determine_ext(format_url) - if (format_type == 'application/x-mpegURL' - or format_id == 'HLS' or ext == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif (format_type == 'application/dash+xml' - or format_id == 'DASH' or ext == 'mpd'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - }) - duration = int_or_none(player.get('duration')) - else: - # Old path, not actual as of 08.04.2020 - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { + def process_format_list(format_list, format_id=""): + nonlocal formats, has_drm + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), - }) - break - f['format_id'] = f_id - formats.append(f) + }) + + player = self._search_json( + r'player:', webpage, 'player', video_id, fatal=False, end_pattern=r';\s*</script>') + if player: + for src in traverse_obj(player, ('lib', 'source', 'sources', ...)): + process_format_list(src) + duration = traverse_obj(player, ('sourceInfo', 'duration', {int_or_none})) + if not formats and not has_drm: + # older code path, in use before August 2023 + player = self._parse_json( + self._search_regex( + (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,', + r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), + webpage, 'player', group='json'), video_id) + if player: + for format_id, format_list in player['tracks'].items(): + process_format_list(format_list, format_id) + duration =
int_or_none(player.get('duration')) if not formats and has_drm: self.report_drm(video_id) From 52414d64ca7b92d3f83964cdd68247989b0c4625 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@protonmail.com> Date: Thu, 21 Sep 2023 16:51:57 -0500 Subject: [PATCH 410/501] [utils] `js_to_json`: Handle `Array` objects Authored by: Grub4K, std-move Co-authored-by: std-move <26625259+std-move@users.noreply.github.com> Co-authored-by: Simon Sawicki <contact@grub4k.xyz> --- test/test_utils.py | 6 ++++++ yt_dlp/utils/_utils.py | 1 + 2 files changed, 7 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 91e3ffd39e..47d1f71bfe 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1218,6 +1218,12 @@ def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') self.assertEqual(js_to_json('`${name}`', {}), '"name"') + def test_js_to_json_map_array_constructors(self): + self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5}) + self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10]) + self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5]) + self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5}) + def test_extract_attributes(self): self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index ef26de1160..213ccc6363 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2727,6 +2727,7 @@ def fix_kv(m): def create_map(mobj): return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) + code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) if not strict: code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) From 904a19ee93195ce0bd4b08bd22b186120afb5b17 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@protonmail.com> Date: Thu, 21 Sep 2023 16:54:57 -0500 Subject: [PATCH 411/501] [ie] Make `_search_nuxt_data` more lenient Authored by: std-move Co-authored-by: std-move <26625259+std-move@users.noreply.github.com> --- yt_dlp/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 7deab995c4..c94b4abdc2 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1687,7 +1687,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata.
This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) - FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' + FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){(?:.*?)return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' js, arg_keys, arg_vals = self._search_regex( (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'), webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), From 568f08051841aedea968258889539741e26009e9 Mon Sep 17 00:00:00 2001 From: std-move <26625259+std-move@users.noreply.github.com> Date: Fri, 22 Sep 2023 00:20:52 +0200 Subject: [PATCH 412/501] [ie/iprima] Fix extractor (#7216) Closes #7229 Authored by: std-move --- yt_dlp/extractor/iprima.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 6dec1510da..f7aa579b38 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -134,10 +134,17 @@ def _real_extract(self, url): ), webpage, 'real id', group='id', default=None) if not video_id: - nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data') + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data', fatal=False) video_id = traverse_obj( nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False) + if not video_id: + nuxt_data = self._search_json( + r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>', + webpage, 'nuxt data', None, end_pattern=r'</script>', contains_pattern=r'\[(?s:.+)\]') + + video_id = traverse_obj(nuxt_data, lambda _, v: re.fullmatch(r'p\d+', v), get_all=False) + if not video_id: self.raise_no_formats('Unable to extract video ID from webpage') From 661c9a1d029296b28e0b2f8be8a72a43abaf6536 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@protonmail.com> Date: Thu, 21 Sep 2023 17:48:57 -0500 Subject: [PATCH 413/501] [test:download] Test for `expected_exception` Authored by: at-wat Co-authored-by: Atsushi Watanabe --- test/test_download.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_download.py b/test/test_download.py index 6f00a4deda..2530792493 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -31,6 +31,7 @@ DownloadError, ExtractorError, UnavailableVideoError, + YoutubeDLError, format_bytes, join_nonempty, ) @@ -100,6 +101,8 @@ def print_skipping(reason): print_skipping('IE marked as not _WORKING') for tc in test_cases: + if tc.get('expected_exception'): + continue info_dict = tc.get('info_dict', {}) params = tc.get('params', {}) if not info_dict.get('id'): @@ -139,6 +142,17 @@ def get_tc_filename(tc): res_dict = None + def match_exception(err): + expected_exception = test_case.get('expected_exception') + if not expected_exception: + return False + if err.__class__.__name__ == expected_exception: + return True + for exc in err.exc_info: + if exc.__class__.__name__ == expected_exception: + return True + return False + def try_rm_tcs_files(tcs=None): if tcs is None: tcs = test_cases @@ -161,6 +175,8 @@ def try_rm_tcs_files(tcs=None): except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].status == 503): + if match_exception(err): + return err.msg = f'{getattr(err, "msg", err)} ({tname})' raise @@ -171,6 +187,10 @@ def try_rm_tcs_files(tcs=None): print(f'Retrying: {try_num} failed tries\n\n##########\n\n') try_num += 1 + except YoutubeDLError as err: + if
match_exception(err): + return + raise else: break From c1d71d0d9f41db5e4306c86af232f5f6220a130b Mon Sep 17 00:00:00 2001 From: Atsushi Watanabe Date: Fri, 22 Sep 2023 08:04:05 +0900 Subject: [PATCH 414/501] [ie/twitcasting] Support `--wait-for-video` (#7975) Authored by: at-wat --- yt_dlp/extractor/twitcasting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 3890d5d8fb..540e217fd8 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -5,8 +5,9 @@ from .common import InfoExtractor from ..dependencies import websockets from ..utils import ( - clean_html, ExtractorError, + UserNotLive, + clean_html, float_or_none, get_element_by_class, get_element_by_id, @@ -235,6 +236,9 @@ class TwitCastingLiveIE(InfoExtractor): _TESTS = [{ 'url': 'https://twitcasting.tv/ivetesangalo', 'only_matching': True, + }, { + 'url': 'https://twitcasting.tv/c:unusedlive', + 'expected_exception': 'UserNotLive', }] def _real_extract(self, url): @@ -260,7 +264,7 @@ def _real_extract(self, url): r'(?s)<a\s+class="tw-movie-thumbnail"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>', webpage, 'current live ID 2', default=None, group='video_id') if not current_live: - raise ExtractorError('The user is not currently live') + raise UserNotLive(video_id=uploader_id) return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live)) From c2da0b5ea215298135f76e3dc14b972a3c4afacb Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@protonmail.com> Date: Sat, 23 Sep 2023 14:54:00 -0500 Subject: [PATCH 415/501] [ie/ArteTV] Fix HLS formats extraction Closes #8156 Authored by: bashonly --- yt_dlp/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index e3cc5afb05..a19cd2a3ae 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -169,7 +169,7 @@ def _real_extract(self, url): ))) short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') - if stream['protocol'].startswith('HLS'): + if 'HLS' in stream['protocol']: fmts, subs = self._extract_m3u8_formats_and_subtitles( stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) for fmt in fmts: From 5ca095cbcde3e32642a4fe5b2d69e8e3c785a021 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 23 Sep 2023 15:00:31 -0500 Subject: [PATCH 416/501] [cleanup] Misc (#8182) Closes #7796, Closes #8028 Authored by: barsnick, sqrtNOT, gamer191, coletdjnz, Grub4K, bashonly --- CONTRIBUTING.md | 8 ++++---- README.md | 2 +- devscripts/make_changelog.py | 2 +- test/test_YoutubeDL.py | 1 - test/test_networking_utils.py | 6 +++--- yt_dlp/YoutubeDL.py | 6 +++--- yt_dlp/compat/urllib/__init__.py | 2 +- yt_dlp/extractor/abc.py | 1 - yt_dlp/extractor/ign.py | 4 ---- yt_dlp/extractor/nebula.py | 1 - yt_dlp/extractor/peekvids.py | 1 - yt_dlp/extractor/radiofrance.py | 2 +- yt_dlp/extractor/rcs.py | 6 +++--- yt_dlp/extractor/rokfin.py | 1 - yt_dlp/extractor/s4c.py | 2 -- yt_dlp/extractor/sovietscloset.py | 1 - yt_dlp/extractor/youtube.py | 2 +- yt_dlp/networking/__init__.py | 2 +- yt_dlp/networking/_urllib.py | 2 +- yt_dlp/networking/exceptions.py | 4 ++-- 20 files changed, 22 insertions(+), 34 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8587fe92d..90e7faf7c4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -217,7 +217,7 @@ ## Adding support for a new site 1.
Add an import in [`yt_dlp/extractor/_extractors.py`](yt_dlp/extractor/_extractors.py). Note that the class name must end with `IE`. 1. Run `python test/test_download.py TestDownload.test_YourExtractor` (note that `YourExtractor` doesn't end with `IE`). This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, the tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. You can also run all the tests in one go with `TestDownload.test_YourExtractor_all` 1. Make sure you have atleast one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. -1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L91-L426). Add tests and code for as many as you want. +1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L119-L440). Add tests and code for as many as you want. 1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 yt_dlp/extractor/yourextractor.py @@ -251,7 +251,7 @@ ## yt-dlp coding conventions ### Mandatory and optional metafields -For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L91-L426) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: +For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L119-L440) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: - `id` (media identifier) - `title` (media title) @@ -696,7 +696,7 @@ #### Examples ### Use convenience conversion and parsing functions -Wrap all extracted numeric data into safe functions from [`yt_dlp/utils.py`](yt_dlp/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +Wrap all extracted numeric data into safe functions from [`yt_dlp/utils/`](yt_dlp/utils/): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. Use `url_or_none` for safe URL processing. @@ -704,7 +704,7 @@ ### Use convenience conversion and parsing functions Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. -Explore [`yt_dlp/utils.py`](yt_dlp/utils.py) for more useful convenience functions. +Explore [`yt_dlp/utils/`](yt_dlp/utils/) for more useful convenience functions. 
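To make the guidance above concrete, here is a minimal, self-contained sketch of how these helpers behave on messy input (this example is illustrative only and is not part of the patch series; the `data` dict and its values are invented):

```python
from yt_dlp.utils import int_or_none, parse_duration, unified_strdate, url_or_none

# Hypothetical, deliberately messy metadata as a site's API might return it
data = {'views': '1337', 'length': '01:02:03', 'published': '24 September 2023', 'thumb': 'N/A'}

info = {
    'view_count': int_or_none(data.get('views')),            # '1337' -> 1337; missing/invalid -> None
    'duration': parse_duration(data.get('length')),           # '01:02:03' -> 3723 (seconds)
    'upload_date': unified_strdate(data.get('published')),    # -> '20230924' (YYYYMMDD)
    'thumbnail': url_or_none(data.get('thumb')),              # non-URL junk -> None instead of garbage
}
```

Each helper returns `None` rather than raising when the input is missing or malformed, which is why they are preferred over bare `int()` calls and manual string handling in extractors.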
#### Examples diff --git a/README.md b/README.md index d94d8ea822..d9b11952de 100644 --- a/README.md +++ b/README.md @@ -1800,7 +1800,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index ac68dcd19a..9ff65db146 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -260,7 +260,7 @@ class CommitRange: AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE) MESSAGE_RE = re.compile(r''' (?:\[(?P<prefix>[^\]]+)\]\ )? (?:(?P<sub_details>`?[\w.-]+`?): )? (?P<message>.+?) (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))?
''', re.VERBOSE | re.DOTALL) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3cfb61fb26..916ee48b97 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -631,7 +631,6 @@ def test_add_extra_info(self): self.assertEqual(test_dict['playlist'], 'funny videos') outtmpl_info = { - 'id': '1234', 'id': '1234', 'ext': 'mp4', 'width': None, diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index dbf656090d..419aae1e47 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -269,14 +269,14 @@ def test_compat_http_error_autoclose(self): assert not response.closed def test_incomplete_read_error(self): - error = IncompleteRead(b'test', 3, cause='test') + error = IncompleteRead(4, 3, cause='test') assert isinstance(error, IncompleteRead) assert repr(error) == '<IncompleteRead: 4 bytes read, 3 more expected>' assert str(error) == error.msg == '4 bytes read, 3 more expected' - assert error.partial == b'test' + assert error.partial == 4 assert error.expected == 3 assert error.cause == 'test' - error = IncompleteRead(b'aaa') + error = IncompleteRead(3) assert repr(error) == '<IncompleteRead: 3 bytes read>' assert str(error) == '3 bytes read' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 1feed30524..39aaf2c2ed 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -239,9 +239,9 @@ class YoutubeDL: 'selected' (check selected formats), or None (check only if requested by extractor) paths: Dictionary of output paths. The allowed keys are 'home' - 'temp' and the keys of OUTTMPL_TYPES (in utils.py) + 'temp' and the keys of OUTTMPL_TYPES (in utils/_utils.py) outtmpl: Dictionary of templates for output names. Allowed keys - are 'default' and the keys of OUTTMPL_TYPES (in utils.py). + are 'default' and the keys of OUTTMPL_TYPES (in utils/_utils.py). For compatibility with youtube-dl, a single string can also be used outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names @@ -422,7 +422,7 @@ class YoutubeDL: asked whether to download the video. - Raise utils.DownloadCancelled(msg) to abort remaining downloads when a video is rejected. - match_filter_func in utils.py is one example for this. + match_filter_func in utils/_utils.py is one example for this. color: A Dictionary with output stream names as keys and their respective color policy as values. Can also just be a single color policy, diff --git a/yt_dlp/compat/urllib/__init__.py b/yt_dlp/compat/urllib/__init__.py index b27cc6133c..9084b3c2bf 100644 --- a/yt_dlp/compat/urllib/__init__.py +++ b/yt_dlp/compat/urllib/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa: F405 from urllib import * # noqa: F403 -del request +del request # noqa: F821 from . import request # noqa: F401 from ..compat_utils import passthrough_module diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index f56133eb3e..d2cf5f7c51 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -180,7 +180,6 @@ class ABCIViewIE(InfoExtractor): _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)' _GEO_COUNTRIES = ['AU'] - # ABC iview programs are normally available for 14 days only.
_TESTS = [{ 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', 'md5': '67715ce3c78426b11ba167d875ac6abf', diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py index 64875f8ceb..1c4f105e9b 100644 --- a/yt_dlp/extractor/ign.py +++ b/yt_dlp/extractor/ign.py @@ -197,10 +197,6 @@ class IGNVideoIE(IGNBaseIE): 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', 'duration': 298, 'tags': 'count:13', - 'display_id': '112203', - 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', - 'duration': 298, - 'tags': 'count:13', }, 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 4f3e691b71..8fba2bcf74 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -127,7 +127,6 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', 'uploader_id': 'lindsayellis', - 'timestamp': 1533009600, 'uploader_url': 'https://nebula.tv/lindsayellis', 'series': 'Lindsay Ellis', 'display_id': 'that-time-disney-remade-beauty-and-the-beast', diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index d1fc058b92..41f591b093 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -146,7 +146,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'uploader': 'Brazzers', 'age_limit': 18, 'view_count': int, - 'age_limit': 18, 'categories': list, 'tags': list, }, diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 35f4b91dd2..ec1b97631e 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -82,7 +82,7 @@ class RadioFranceBaseIE(InfoExtractor): def _extract_data_from_webpage(self, webpage, display_id, key): return traverse_obj(self._search_json( r'\bconst\s+data\s*=', webpage, key, display_id, - contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json), + contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json), (..., 'data', key, {dict}), get_all=False) or {} diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py index 028d3d90bb..b865f63fbd 100644 --- a/yt_dlp/extractor/rcs.py +++ b/yt_dlp/extractor/rcs.py @@ -239,10 +239,10 @@ class RCSEmbedsIE(RCSBaseIE): } }, { 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', - 'match_only': True + 'only_matching': True }, { 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', - 'match_only': True + 'only_matching': True }] _WEBPAGE_TESTS = [{ 'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/', @@ -325,7 +325,7 @@ class RCSIE(RCSBaseIE): } }, { 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', - 'match_only': True + 'only_matching': True }] diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index 4a4d40befd..cad76f0c99 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -40,7 +40,6 @@ class RokfinIE(InfoExtractor): 'channel': 'Jimmy Dore', 'channel_id': 65429, 'channel_url': 'https://rokfin.com/TheJimmyDoreShow', - 'duration': 213.0, 'availability': 'public', 'live_status': 'not_live', 'dislike_count': int, diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py index 990ea2b447..67eff723b1 100644 --- 
a/yt_dlp/extractor/s4c.py +++ b/yt_dlp/extractor/s4c.py @@ -78,7 +78,6 @@ class S4CSeriesIE(InfoExtractor): 'info_dict': { 'id': '864982911', 'title': 'Iaith ar Daith', - 'description': 'md5:e878ebf660dce89bd2ef521d7ce06397' }, }, { 'url': 'https://www.s4c.cymru/clic/series/866852587', @@ -86,7 +85,6 @@ class S4CSeriesIE(InfoExtractor): 'info_dict': { 'id': '866852587', 'title': 'FFIT Cymru', - 'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96' }, }] diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index 453016ccb3..493eea2a69 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -76,7 +76,6 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$', - 'uploader': 'SovietWomble', 'creator': 'SovietWomble', 'release_timestamp': 1461157200, 'release_date': '20160420', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 023d8fd8c1..a39d17cf11 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -902,7 +902,7 @@ def extract_relative_time(relative_time_text): e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - # XXX: this could be moved to a general function in utils.py + # XXX: this could be moved to a general function in utils/_utils.py # The relative time text strings are roughly the same as what # Javascript's Intl.RelativeTimeFormat function generates. # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 5e88764844..5b1599a6dc 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -1,4 +1,4 @@ -# flake8: noqa: 401 +# flake8: noqa: F401 from .common import ( HEADRequest, PUTRequest, diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index c327f7744e..9e2bf33e45 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -337,7 +337,7 @@ def handle_sslerror(e: ssl.SSLError): def handle_response_read_exceptions(e): if isinstance(e, http.client.IncompleteRead): - raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e + raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e elif isinstance(e, ssl.SSLError): handle_sslerror(e) elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)): diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 465b18ba94..f58dc246e6 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -75,10 +75,10 @@ def __repr__(self): class IncompleteRead(TransportError): - def __init__(self, partial, expected=None, **kwargs): + def __init__(self, partial: int, expected: int = None, **kwargs): self.partial = partial self.expected = expected - msg = f'{len(partial)} bytes read' + msg = f'{partial} bytes read' if expected is not None: msg += f', {expected} more expected' From eaee21bf71889d495076037cbe590c8c0b21ef3a Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 23 Sep 2023 23:13:48 +0100 Subject: [PATCH 417/501] [ie/Monstercat] Add extractor (#8133) Closes #8067 Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/monstercat.py | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) 
create mode 100644 yt_dlp/extractor/monstercat.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9cda06d8fa..691cac339f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1126,6 +1126,7 @@ MofosexEmbedIE, ) from .mojvideo import MojvideoIE +from .monstercat import MonstercatIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, diff --git a/yt_dlp/extractor/monstercat.py b/yt_dlp/extractor/monstercat.py new file mode 100644 index 0000000000..7f04825fcd --- /dev/null +++ b/yt_dlp/extractor/monstercat.py @@ -0,0 +1,79 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + get_element_text_and_html_by_tag, + int_or_none, + unified_strdate, + strip_or_none, + traverse_obj, + try_call, +) + + +class MonstercatIE(InfoExtractor): + _VALID_URL = r'https://www\.monstercat\.com/release/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.monstercat.com/release/742779548009', 'playlist_count': 20, 'info_dict': { 'title': 'The Secret Language of Trees', 'id': '742779548009', 'thumbnail': 'https://www.monstercat.com/release/742779548009/cover', 'release_year': 2023, 'release_date': '20230711', 'album': 'The Secret Language of Trees', 'album_artist': 'BT', } }] + + def _extract_tracks(self, table, album_meta): + for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag + title = clean_html(try_call( + lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0])) Date: Sun, 24 Sep 2023 06:15:01 +0800 Subject: [PATCH 418/501] [ie/PIAULIZAPortal] Add extractor (#7903) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/piaulizaportal.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 yt_dlp/extractor/piaulizaportal.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 691cac339f..49c35cf713 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1452,6 +1452,7 @@ from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .piapro import PiaproIE +from .piaulizaportal import PIAULIZAPortalIE from .picarto import ( PicartoIE, PicartoVodIE, diff --git a/yt_dlp/extractor/piaulizaportal.py b/yt_dlp/extractor/piaulizaportal.py new file mode 100644 index 0000000000..1eb6d92b72 --- /dev/null +++ b/yt_dlp/extractor/piaulizaportal.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + time_seconds, + traverse_obj, +) + + +class PIAULIZAPortalIE(InfoExtractor): + IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM' + _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', 'info_dict': { 'id': '005f18b7-e810-5618-cb82-0987c5755d44', 'title': 'プレゼンテーションプレイヤーのサンプル', 'live_status': 'not_live', }, 'params': { 'skip_download': True, 'ignore_no_formats_error': True, }, }, { 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', 'info_dict': { 'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', 'title': '【確認用】視聴サンプルページ(ULIZA)', 'live_status': 'not_live',
+ }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0))) + if expires and expires <= time_seconds(): + raise ExtractorError('The link is expired.', video_id=video_id, expected=True) + + webpage = self._download_webpage(url, video_id) + + player_data = self._download_webpage( + self._search_regex( + r'' _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + _CLIENT_DATA = { + 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', + 'clientSecret': 'CZuvCL49d9OwfGsR', + 'deviceId': str(uuid.uuid4()), + 'deviceInfo': base64.b64encode(json.dumps({ + 'model': 'desktop', + 'version': 'Chrome', + 'osName': 'Windows', + 'osVersion': '10.0', + }, separators=(',', ':')).encode()).decode(), + 'networkType': 'other', + 'nflClaimGroupsToAdd': [], + 'nflClaimGroupsToRemove': [], + } + _ACCOUNT_INFO = {} + _API_KEY = None + + _TOKEN = None + _TOKEN_EXPIRY = 0 + + def _get_account_info(self, url, slug): + if not self._API_KEY: + webpage = self._download_webpage(url, slug, fatal=False) or '' + self._API_KEY = self._search_regex( + r'window\.gigyaApiKey\s*=\s*["\'](\w+)["\'];', webpage, 'API key', + fatal=False) or '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f' + + cookies = self._get_cookies('https://auth-id.nfl.com/') + login_token = traverse_obj(cookies, ( + (f'glt_{self._API_KEY}', lambda k, _: k.startswith('glt_')), {lambda x: x.value}), get_all=False) + if not login_token: + self.raise_login_required() + if 'ucid' not in cookies: + raise ExtractorError( + 'Required cookies for the auth-id.nfl.com domain were not found among passed cookies. ' + 'If using --cookies, these cookies must be exported along with .nfl.com cookies, ' + 'or else try using --cookies-from-browser instead', expected=True) + + account = self._download_json( + 'https://auth-id.nfl.com/accounts.getAccountInfo', slug, + note='Downloading account info', data=urlencode_postdata({ + 'include': 'profile,data', + 'lang': 'en', + 'APIKey': self._API_KEY, + 'sdk': 'js_latest', + 'login_token': login_token, + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': traverse_obj(cookies, ( + 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='15170'), + 'format': 'json', + }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + self._ACCOUNT_INFO = traverse_obj(account, { + 'signatureTimestamp': 'signatureTimestamp', + 'uid': 'UID', + 'uidSignature': 'UIDSignature', + }) + + if len(self._ACCOUNT_INFO) != 3: + raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) + + def _get_auth_token(self, url, slug): + if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30): + return + + if not self._ACCOUNT_INFO: + self._get_account_info(url, slug) + + token = self._download_json( + 'https://api.nfl.com/identity/v3/token%s' % ( + '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), + slug, headers={'Content-Type': 'application/json'}, note='Downloading access token', + data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) + + self._TOKEN = token['accessToken'] + self._TOKEN_EXPIRY = token['expiresIn'] + self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) item = video_config['playlist'][0] @@ -168,7 +247,7 @@ def 
_real_extract(self, url): class NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/[\w-]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/(?P[\w-]+)(?:/(?P\d+))?' _TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -185,23 +264,92 @@ class NFLPlusReplayIE(NFLBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1', + 'playlist_count': 4, + 'info_dict': { + 'id': 'giants-at-vikings-2022-post-1', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'playlist_count': 2, + 'info_dict': { + 'id': 'giants-at-patriots-2011-pre-4', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'info_dict': { + 'id': '950701', + 'ext': 'mp4', + 'title': 'Giants @ Patriots', + 'description': 'Giants at Patriots on September 01, 2011', + 'uploader': 'NFL', + 'upload_date': '20210724', + 'timestamp': 1627085874, + 'duration': 1532, + 'categories': ['Game Highlights'], + 'tags': ['play-by-play'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'nflplusreplay': {'type': ['condensed_game']}}, + }, }] + _REPLAY_TYPES = { + 'full_game': 'Full Game', + 'full_game_spanish': 'Full Game - Spanish', + 'condensed_game': 'Condensed Game', + 'all_22': 'All-22', + } + def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + slug, video_id = self._match_valid_url(url).group('slug', 'id') + requested_types = self._configuration_arg('type', ['all']) + if 'all' in requested_types: + requested_types = list(self._REPLAY_TYPES.keys()) + requested_types = traverse_obj(self._REPLAY_TYPES, (None, requested_types)) + + if not video_id: + self._get_auth_token(url, slug) + headers = {'Authorization': f'Bearer {self._TOKEN}'} + game_id = self._download_json( + f'https://api.nfl.com/football/v2/games/externalId/slug/{slug}', slug, + 'Downloading game ID', query={'withExternalIds': 'true'}, headers=headers)['id'] + replays = self._download_json( + 'https://api.nfl.com/content/v1/videos/replays', slug, 'Downloading replays JSON', + query={'gameId': game_id}, headers=headers) + if len(requested_types) == 1: + video_id = traverse_obj(replays, ( + 'items', lambda _, v: v['subType'] == requested_types[0], 'mcpPlaybackId'), get_all=False) + + if video_id: + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + def entries(): + for replay in traverse_obj( + replays, ('items', lambda _, v: v['mcpPlaybackId'] and v['subType'] in requested_types) + ): + video_id = replay['mcpPlaybackId'] + yield self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + return self.playlist_result(entries(), slug) class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P[\w-]+)' _TESTS = [{ - 'note': 'premium content', + 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', 'info_dict': { 'id': '1576832', 'ext': 'mp4', - 'title': 'Kurt\'s QB Insider: Conference Championships', + 'title': 'Conference Championships', 'description': 
'md5:944f7fab56f7a37430bf8473f5473857', 'uploader': 'NFL', 'upload_date': '20230127', @@ -214,85 +362,9 @@ class NFLPlusEpisodeIE(NFLBaseIE): 'params': {'skip_download': 'm3u8'}, }] - _CLIENT_DATA = { - 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', - 'clientSecret': 'CZuvCL49d9OwfGsR', - 'deviceId': str(uuid.uuid4()), - 'deviceInfo': base64.b64encode(json.dumps({ - 'model': 'desktop', - 'version': 'Chrome', - 'osName': 'Windows', - 'osVersion': '10.0', - }, separators=(',', ':')).encode()).decode(), - 'networkType': 'other', - 'nflClaimGroupsToAdd': [], - 'nflClaimGroupsToRemove': [], - } - _ACCOUNT_INFO = {} - _API_KEY = None - - _TOKEN = None - _TOKEN_EXPIRY = 0 - - def _get_account_info(self, url, video_id): - cookies = self._get_cookies('https://www.nfl.com/') - login_token = traverse_obj(cookies, ( - (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}', - lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')), - {lambda x: x.value}), get_all=False) - if not login_token: - self.raise_login_required() - - account = self._download_json( - 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id, - note='Downloading account info', data=urlencode_postdata({ - 'include': 'profile,data', - 'lang': 'en', - 'APIKey': self._API_KEY, - 'sdk': 'js_latest', - 'login_token': login_token, - 'authMode': 'cookie', - 'pageURL': url, - 'sdkBuild': traverse_obj(cookies, ( - 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'), - 'format': 'json', - }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - self._ACCOUNT_INFO = traverse_obj(account, { - 'signatureTimestamp': 'signatureTimestamp', - 'uid': 'UID', - 'uidSignature': 'UIDSignature', - }) - - if len(self._ACCOUNT_INFO) != 3: - raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) - - def _get_auth_token(self, url, video_id): - if not self._ACCOUNT_INFO: - self._get_account_info(url, video_id) - - token = self._download_json( - 'https://api.nfl.com/identity/v3/token%s' % ( - '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), - video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token', - data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) - - self._TOKEN = token['accessToken'] - self._TOKEN_EXPIRY = token['expiresIn'] - self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] - def _real_extract(self, url): slug = self._match_id(url) - - if not self._API_KEY: - webpage = self._download_webpage(url, slug, fatal=False) or '' - self._API_KEY = self._search_regex( - r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key', - default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f') - - if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()): - self._get_auth_token(url, slug) - + self._get_auth_token(url, slug) video_id = self._download_json( f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ 'Authorization': f'Bearer {self._TOKEN}', From 61bdf15fc7400601c3da1aa7a43917310a5bf391 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:24:47 +0200 Subject: [PATCH 423/501] [core] Raise minimum recommended Python version to 3.8 (#8183) Authored by: Grub4K --- devscripts/changelog_override.json | 5 +++++ test/test_execution.py | 3 +++ yt_dlp/YoutubeDL.py | 16 ++++------------ yt_dlp/update.py | 25 +++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git 
a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e7f453acf8..9dfbf510f7 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -88,5 +88,10 @@ "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85", "short": "[rh:urllib] Simplify gzip decoding (#7611)", "authors": ["Grub4K"] + }, + { + "action": "add", + "when": "c1d71d0d9f41db5e4306c86af232f5f6220a130b", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.8**\nSince Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)" } ] diff --git a/test/test_execution.py b/test/test_execution.py index 7a9e800b66..fb2f6e2e9c 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -45,6 +45,9 @@ def test_lazy_extractors(self): self.assertTrue(os.path.exists(LAZY_EXTRACTORS)) _, stderr = self.run_yt_dlp(opts=('-s', 'test:')) + # `MIN_RECOMMENDED` emits a deprecated feature warning for deprecated python versions + if stderr and stderr.startswith('Deprecated Feature: Support for Python'): + stderr = '' self.assertFalse(stderr) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=subprocess.DEVNULL) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 39aaf2c2ed..f322b12a22 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -60,7 +60,7 @@ get_postprocessor, ) from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping -from .update import REPOSITORY, current_git_head, detect_variant +from .update import REPOSITORY, _get_system_deprecation, current_git_head, detect_variant from .utils import ( DEFAULT_OUTTMPL, IDENTITY, @@ -640,17 +640,9 @@ def process_color_policy(stream): for name, stream in self._out_files.items_ if name != 'console' }) - # The code is left like this to be reused for future deprecations - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7) - current_version = sys.version_info[:2] - if current_version < MIN_RECOMMENDED: - msg = ('Support for Python version %d.%d has been deprecated. ' - 'See https://github.com/yt-dlp/yt-dlp/issues/3764 for more details.' - '\n You will no longer receive updates on this version') - if current_version < MIN_SUPPORTED: - msg = 'Python version %d.%d is no longer supported' - self.deprecated_feature( - f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) + system_deprecation = _get_system_deprecation() + if system_deprecation: + self.deprecated_feature(system_deprecation.replace('\n', '\n ')) if self.params.get('allow_unplayable_formats'): self.report_warning( diff --git a/yt_dlp/update.py b/yt_dlp/update.py index d708b09e35..db79df1271 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -112,6 +112,31 @@ def is_non_updateable(): detect_variant(), _NON_UPDATEABLE_REASONS['unknown' if VARIANT else 'other']) +def _get_system_deprecation(): + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 8) + + if sys.version_info > MIN_RECOMMENDED: + return None + + major, minor = sys.version_info[:2] + if sys.version_info < MIN_SUPPORTED: + msg = f'Python version {major}.{minor} is no longer supported' + else: + msg = f'Support for Python version {major}.{minor} has been deprecated. 
' + # Temporary until `win_x86_exe` uses 3.8, which will deprecate Vista and Server 2008 + if detect_variant() == 'win_x86_exe': + platform_name = platform.platform() + if any(platform_name.startswith(f'Windows-{name}') for name in ('Vista', '2008Server')): + msg = 'Support for Windows Vista/Server 2008 has been deprecated. ' + else: + return None + msg += ('See https://github.com/yt-dlp/yt-dlp/issues/7803 for details.' + '\nYou may stop receiving updates on this version at any time') + + major, minor = MIN_RECOMMENDED + return f'{msg}! Please update to Python {major}.{minor} or above' + + def _sha256_file(path): h = hashlib.sha256() mv = memoryview(bytearray(128 * 1024)) From de015e930747165dbb8fcd360f8775fd973b7d6e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:29:01 +0200 Subject: [PATCH 424/501] [core] Prevent RCE when using `--exec` with `%q` (CVE-2023-40581) The shell escape function is now using `""` instead of `\"`. `utils.Popen` has been patched to properly quote commands. Prior to this fix using `--exec` together with `%q` when on Windows could cause remote code to execute. See https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg for reference. Authored by: Grub4K --- devscripts/changelog_override.json | 5 +++++ test/test_YoutubeDL.py | 6 +++--- test/test_utils.py | 16 ++++++++++++++++ yt_dlp/compat/__init__.py | 2 +- yt_dlp/postprocessor/exec.py | 12 +++++------- yt_dlp/utils/_utils.py | 18 ++++++++++++++++-- 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 9dfbf510f7..fe0c82c66b 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -93,5 +93,10 @@ "action": "add", "when": "c1d71d0d9f41db5e4306c86af232f5f6220a130b", "short": "[priority] **The minimum *recommended* Python version has been raised to 3.8**\nSince Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)" + }, + { + "action": "add", + "when": "61bdf15fc7400601c3da1aa7a43917310a5bf391", + "short": "[priority] Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg)\n - The shell escape function is now using `\"\"` instead of `\\\"`.\n - `utils.Popen` has been patched to properly quote commands." 
} ] diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 916ee48b97..0cf130db03 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -784,9 +784,9 @@ def expect_same_infodict(out): test('%(title4)#S', 'foo_bar_test') test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if compat_os_name == 'nt' else ' '))) if compat_os_name == 'nt': - test('%(title4)q', ('"foo \\"bar\\" test"', ""foo ⧹"bar⧹" test"")) - test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', '"id 1" "id 2" "id 3"')) - test('%(formats.0.id)#q', ('"id 1"', '"id 1"')) + test('%(title4)q', ('"foo ""bar"" test"', None)) + test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', None)) + test('%(formats.0.id)#q', ('"id 1"', None)) else: test('%(title4)q', ('\'foo "bar" test\'', '\'foo "bar" test\'')) test('%(formats.:.id)#q', "'id 1' 'id 2' 'id 3'") diff --git a/test/test_utils.py b/test/test_utils.py index 47d1f71bfe..dc2d8ce12b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -14,6 +14,7 @@ import io import itertools import json +import subprocess import xml.etree.ElementTree from yt_dlp.compat import ( @@ -28,6 +29,7 @@ InAdvancePagedList, LazyList, OnDemandPagedList, + Popen, age_restricted, args_to_str, base_url, @@ -2388,6 +2390,20 @@ def test_extract_basic_auth(self): assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=') assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') + @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows') + def test_Popen_windows_escaping(self): + def run_shell(args): + stdout, stderr, error = Popen.run( + args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + assert not stderr + assert not error + return stdout + + # Test escaping + assert run_shell(['echo', 'test"&']) == '"test""&"\n' + # Test if delayed expansion is disabled + assert run_shell(['echo', '^!']) == '"^!"\n' + assert run_shell('echo "^!"') == '"^!"\n' if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index 832a9138d3..5ad5c70ecf 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -30,7 +30,7 @@ def compat_etree_fromstring(text): if compat_os_name == 'nt': def compat_shlex_quote(s): import re - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') + return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""') else: from shlex import quote as compat_shlex_quote # noqa: F401 diff --git a/yt_dlp/postprocessor/exec.py b/yt_dlp/postprocessor/exec.py index cfc83167ce..c2e73fbabd 100644 --- a/yt_dlp/postprocessor/exec.py +++ b/yt_dlp/postprocessor/exec.py @@ -1,8 +1,6 @@ -import subprocess - from .common import PostProcessor from ..compat import compat_shlex_quote -from ..utils import PostProcessingError, encodeArgument, variadic +from ..utils import Popen, PostProcessingError, variadic class ExecPP(PostProcessor): @@ -27,10 +25,10 @@ def parse_cmd(self, cmd, info): def run(self, info): for tmpl in self.exec_cmd: cmd = self.parse_cmd(tmpl, info) - self.to_screen('Executing command: %s' % cmd) - retCode = subprocess.call(encodeArgument(cmd), shell=True) - if retCode != 0: - raise PostProcessingError('Command returned error code %d' % retCode) + self.to_screen(f'Executing command: {cmd}') + _, _, return_code = Popen.run(cmd, shell=True) + if return_code != 0: + raise PostProcessingError(f'Command returned error code {return_code}') return [], info diff --git 
a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 213ccc6363..ba62423806 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -825,7 +825,7 @@ def _fix(key): _fix('LD_LIBRARY_PATH') # Linux _fix('DYLD_LIBRARY_PATH') # macOS - def __init__(self, *args, env=None, text=False, **kwargs): + def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs): if env is None: env = os.environ.copy() self._fix_pyinstaller_ld_path(env) @@ -835,7 +835,21 @@ def __init__(self, *args, env=None, text=False, **kwargs): kwargs['universal_newlines'] = True # For 3.6 compatibility kwargs.setdefault('encoding', 'utf-8') kwargs.setdefault('errors', 'replace') - super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) + + if shell and compat_os_name == 'nt' and kwargs.get('executable') is None: + if not isinstance(args, str): + args = ' '.join(compat_shlex_quote(a) for a in args) + shell = False + args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"' + + super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo) + + def __comspec(self): + comspec = os.environ.get('ComSpec') or os.path.join( + os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe') + if os.path.isabs(comspec): + return comspec + raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set') def communicate_or_kill(self, *args, **kwargs): try: From 088add9567d39b758737e4299a0e619fd89d2e8f Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:35:23 +0200 Subject: [PATCH 425/501] [cleanup] Misc Authored by: Grub4K --- test/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_utils.py b/test/test_utils.py index dc2d8ce12b..fd612ff86f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2405,5 +2405,6 @@ def run_shell(args): assert run_shell(['echo', '^!']) == '"^!"\n' assert run_shell('echo "^!"') == '"^!"\n' + if __name__ == '__main__': unittest.main() From c54ddfba0f7d68034339426223d75373c5fc86df Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 24 Sep 2023 00:38:42 +0000 Subject: [PATCH 426/501] Release 2023.09.24 Created by: Grub4K :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/6_question.yml | 8 +- CONTRIBUTORS | 36 ++++ Changelog.md | 196 ++++++++++++++++++ supportedsites.md | 49 ++++- yt_dlp/version.py | 4 +- 10 files changed, 298 insertions(+), 35 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index dd1b33dde2..f0fc71d575 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 
'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 4f4378924d..ac9a72a1c1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 05b4dd23b3..577e4d4910 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: 
[debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 880f1014c2..9529c1bd6c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index acb11795f6..b17a6e046c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the 
[bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index a2563e975b..5345e8917c 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6b9b9f4701..72b9584ecf 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -467,3 +467,39 @@ rdamas RfadnjdExt urectanc nao20010128nao/Lesmiscore +04-pasha-04 +aaruni96 +aky-01 +AmirAflak +ApoorvShah111 +at-wat +davinkevin +demon071 +denhotte +FinnRG +fireattack +Frankgoji +GD-Slime +hatsomatt +ifan-t +kshitiz305 +kylegustavo +mabdelfattah +nathantouze +niemands +Rajeshwaran2001 +RedDeffender +Rohxn16 +sb0stn +SevenLives +simon300000 +snixon +soundchaser128 +szabyg +trainman261 +trislee +wader +Yalab7 +zhallgato +zhong-yiyu +Zprokkel diff --git a/Changelog.md b/Changelog.md index 32cdaca2ab..04511927fa 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,202 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.09.24 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.8** +Since Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803) +- Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg) + - The shell escape function is now using `""` instead of `\"`. + - `utils.Popen` has been patched to properly quote commands. 
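Editor's note: to illustrate the escaping change described in the security note above, here is a minimal standalone sketch — not the patched yt-dlp code itself, just the two `compat_shlex_quote` expressions from the earlier diff reduced to free functions, with a purely hypothetical `calc.exe` payload. cmd.exe does not treat backslash as an escape character, so `\"` actually closes the quoted argument and leaves metacharacters like `&` exposed; doubling the quote keeps the whole value inside one quoted argument.

```python
import re


def quote_old(s):
    # Pre-fix behaviour: backslash-escape embedded double quotes
    return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')


def quote_new(s):
    # Post-fix behaviour: double embedded quotes, then wrap the value in quotes
    return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""')


payload = 'foo" & calc.exe & "'  # hypothetical attacker-controlled metadata
print(quote_old(payload))  # "foo\" & calc.exe & \""  -- cmd.exe sees a bare `&`
print(quote_new(payload))  # "foo"" & calc.exe & """  -- stays one quoted argument
```

With the old quoting, cmd.exe reads the first `\"` as a closing quote and executes everything after the `&`; with the doubled quotes, every character of the payload remains inside a quoted region, which is why `utils.Popen` now also forces `cmd /Q /S /D /V:OFF /C` so the quoting assumptions hold.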
+ +#### Core changes +- [Fix HTTP headers and cookie handling](https://github.com/yt-dlp/yt-dlp/commit/6c5211cebeacfc53ad5d5ddf4a659be76039656f) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- [Fix `--check-formats`](https://github.com/yt-dlp/yt-dlp/commit/8cb7fc44db010e965d808ee679ef0725cb6e147c) by [pukkandan](https://github.com/pukkandan) +- [Fix support for upcoming Python 3.12](https://github.com/yt-dlp/yt-dlp/commit/836e06d246512f286f30c1371b2c54b72c9ecd93) ([#8130](https://github.com/yt-dlp/yt-dlp/issues/8130)) by [Grub4K](https://github.com/Grub4K) +- [Merged with youtube-dl 66ab08](https://github.com/yt-dlp/yt-dlp/commit/9d6254069c75877bc88bc3584f4326fb1853a543) by [coletdjnz](https://github.com/coletdjnz) +- [Prevent RCE when using `--exec` with `%q` (CVE-2023-40581)](https://github.com/yt-dlp/yt-dlp/commit/de015e930747165dbb8fcd360f8775fd973b7d6e) by [Grub4K](https://github.com/Grub4K) +- [Raise minimum recommended Python version to 3.8](https://github.com/yt-dlp/yt-dlp/commit/61bdf15fc7400601c3da1aa7a43917310a5bf391) ([#8183](https://github.com/yt-dlp/yt-dlp/issues/8183)) by [Grub4K](https://github.com/Grub4K) +- [`FFmpegFixupM3u8PP` may need to run with ffmpeg](https://github.com/yt-dlp/yt-dlp/commit/f73c11803579889dc8e1c99e25dba9a22fef39d8) by [pukkandan](https://github.com/pukkandan) +- **compat** + - [Add `types.NoneType`](https://github.com/yt-dlp/yt-dlp/commit/e0c4db04dc82a699bdabd9821ddc239ebe17d30a) by [pukkandan](https://github.com/pukkandan) (With fixes in [25b6e8f](https://github.com/yt-dlp/yt-dlp/commit/25b6e8f94679b4458550702b46e61249b875a4fd)) + - [Deprecate old functions](https://github.com/yt-dlp/yt-dlp/commit/3d2623a898196640f7cc0fc8b70118ff19e6925d) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) + - [Ensure submodules are imported correctly](https://github.com/yt-dlp/yt-dlp/commit/a250b247334ce9f641e709cbb64974da6034a2b3) by [pukkandan](https://github.com/pukkandan) +- **cookies**: [Containers JSON should be opened as utf-8](https://github.com/yt-dlp/yt-dlp/commit/dab87ca23650fd87184ff5286b53e6985b59f71d) ([#7800](https://github.com/yt-dlp/yt-dlp/issues/7800)) by [bashonly](https://github.com/bashonly) +- **dependencies**: [Handle deprecation of `sqlite3.version`](https://github.com/yt-dlp/yt-dlp/commit/35f9a306e6934793cff100200cd03f288ec33f11) ([#8167](https://github.com/yt-dlp/yt-dlp/issues/8167)) by [bashonly](https://github.com/bashonly) +- **outtmpl**: [Fix replacement for `playlist_index`](https://github.com/yt-dlp/yt-dlp/commit/a264433c9fba147ecae2420091614186cfeeb895) by [pukkandan](https://github.com/pukkandan) +- **utils** + - [Add temporary shim for logging](https://github.com/yt-dlp/yt-dlp/commit/1b392f905d20ef1f1b300b180f867d43c9ce49b8) by [pukkandan](https://github.com/pukkandan) + - [Improve `parse_duration`](https://github.com/yt-dlp/yt-dlp/commit/af86873218c24c3859ccf575a87f2b00a73b49d0) by [bashonly](https://github.com/bashonly) + - HTTPHeaderDict: [Handle byte values](https://github.com/yt-dlp/yt-dlp/commit/3f7965105d8d2048359e67c1e8b8ebd51588143b) by [pukkandan](https://github.com/pukkandan) + - `clean_podcast_url`: [Handle more trackers](https://github.com/yt-dlp/yt-dlp/commit/2af4eeb77246b8183aae75a0a8d19f18c08115b2) ([#7556](https://github.com/yt-dlp/yt-dlp/issues/7556)) by [bashonly](https://github.com/bashonly), [mabdelfattah](https://github.com/mabdelfattah) + - `js_to_json`: [Handle `Array` 
objects](https://github.com/yt-dlp/yt-dlp/commit/52414d64ca7b92d3f83964cdd68247989b0c4625) by [Grub4K](https://github.com/Grub4K), [std-move](https://github.com/std-move) + +#### Extractor changes +- [Extract subtitles from SMIL manifests](https://github.com/yt-dlp/yt-dlp/commit/550e65410a7a1b105923494ac44460a4dc1a15d9) ([#7667](https://github.com/yt-dlp/yt-dlp/issues/7667)) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- [Fix `--load-pages`](https://github.com/yt-dlp/yt-dlp/commit/81b4712bca608b9015aa68a4d96661d56e9cb894) by [pukkandan](https://github.com/pukkandan) +- [Make `_search_nuxt_data` more lenient](https://github.com/yt-dlp/yt-dlp/commit/904a19ee93195ce0bd4b08bd22b186120afb5b17) by [std-move](https://github.com/std-move) +- **abematv** + - [Fix proxy handling](https://github.com/yt-dlp/yt-dlp/commit/497bbbbd7328cb705f70eced94dbd90993819a46) ([#8046](https://github.com/yt-dlp/yt-dlp/issues/8046)) by [SevenLives](https://github.com/SevenLives) + - [Temporary fix for protocol handler](https://github.com/yt-dlp/yt-dlp/commit/9f66247289b9f8ecf931833b3f5f127274dd2161) by [pukkandan](https://github.com/pukkandan) +- **amazonminitv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/538d37671a17e0782d17f08df17800e2e3bd57c8) by [bashonly](https://github.com/bashonly), [GautamMKGarg](https://github.com/GautamMKGarg) +- **antenna**: [Support antenna.gr](https://github.com/yt-dlp/yt-dlp/commit/665876034c8d3c031443f6b4958bed02ccdf4164) ([#7584](https://github.com/yt-dlp/yt-dlp/issues/7584)) by [stdedos](https://github.com/stdedos) +- **artetv**: [Fix HLS formats extraction](https://github.com/yt-dlp/yt-dlp/commit/c2da0b5ea215298135f76e3dc14b972a3c4afacb) by [bashonly](https://github.com/bashonly) +- **axs**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/aee6b9b88c0bcccf27fd23b7e00fc0b7b168928f) ([#8094](https://github.com/yt-dlp/yt-dlp/issues/8094)) by [barsnick](https://github.com/barsnick) +- **banbye**: [Support video ids containing a hyphen](https://github.com/yt-dlp/yt-dlp/commit/578a82e497502b951036ce9da6fe0dac6937ac27) ([#8059](https://github.com/yt-dlp/yt-dlp/issues/8059)) by [kshitiz305](https://github.com/kshitiz305) +- **bbc**: [Extract tracklist as chapters](https://github.com/yt-dlp/yt-dlp/commit/eda0e415d26eb084e570cf5372d38ee1f616b70f) ([#7788](https://github.com/yt-dlp/yt-dlp/issues/7788)) by [garret1317](https://github.com/garret1317) +- **bild.de**: [Extract HLS formats](https://github.com/yt-dlp/yt-dlp/commit/b4c1c408c63724339eb12b16c91b253a7ee62cfa) ([#8032](https://github.com/yt-dlp/yt-dlp/issues/8032)) by [barsnick](https://github.com/barsnick) +- **bilibili** + - [Add support for series, favorites and watch later](https://github.com/yt-dlp/yt-dlp/commit/9e68747f9607f05e92bb7d9b6e79d678b50070e1) ([#7518](https://github.com/yt-dlp/yt-dlp/issues/7518)) by [c-basalt](https://github.com/c-basalt) + - [Extract Dolby audio formats](https://github.com/yt-dlp/yt-dlp/commit/b84fda7388dd20d38921e23b469147f3957c1812) ([#8142](https://github.com/yt-dlp/yt-dlp/issues/8142)) by [ClosedPort22](https://github.com/ClosedPort22) + - [Extract `format_id`](https://github.com/yt-dlp/yt-dlp/commit/5336bf57a7061e0955a37f0542fc8ebf50d55b17) ([#7555](https://github.com/yt-dlp/yt-dlp/issues/7555)) by [c-basalt](https://github.com/c-basalt) +- **bilibilibangumi**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/bdd0b75e3f41ff35440eda6d395008beef19ef2f) ([#7337](https://github.com/yt-dlp/yt-dlp/issues/7337)) by 
[GD-Slime](https://github.com/GD-Slime) +- **bpb**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/f659e6439444ac64305b5c80688cd82f59d2279c) ([#8119](https://github.com/yt-dlp/yt-dlp/issues/8119)) by [Grub4K](https://github.com/Grub4K) +- **brilliantpala**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/92feb5654c5a4c81ba872904a618700fcbb3e546) ([#6680](https://github.com/yt-dlp/yt-dlp/issues/6680)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **canal1, caracoltvplay**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b3febedbeb662dfdf9b5c1d5799039ad4fc969de) ([#7151](https://github.com/yt-dlp/yt-dlp/issues/7151)) by [elyse0](https://github.com/elyse0) +- **cbc**: [Ignore any 426 from API](https://github.com/yt-dlp/yt-dlp/commit/9bf14be775289bd88cc1f5c89fd761ae51879484) ([#7689](https://github.com/yt-dlp/yt-dlp/issues/7689)) by [makew0rld](https://github.com/makew0rld) +- **cbcplayer**: [Extract HLS formats and subtitles](https://github.com/yt-dlp/yt-dlp/commit/339c339fec095ff4141b20e6aa83629117fb26df) ([#7484](https://github.com/yt-dlp/yt-dlp/issues/7484)) by [trainman261](https://github.com/trainman261) +- **cbcplayerplaylist**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed711897814f3ee0b1822e4205e74133467e8f1c) ([#7870](https://github.com/yt-dlp/yt-dlp/issues/7870)) by [trainman261](https://github.com/trainman261) +- **cineverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/15591940ff102d1ae337d603a46d8f238c83a61f) ([#8146](https://github.com/yt-dlp/yt-dlp/issues/8146)) by [garret1317](https://github.com/garret1317) +- **crunchyroll**: [Remove initial state extraction](https://github.com/yt-dlp/yt-dlp/commit/9b16762f48914de9ac914601769c76668e433325) ([#7632](https://github.com/yt-dlp/yt-dlp/issues/7632)) by [Grub4K](https://github.com/Grub4K) +- **douyutv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd) ([#7652](https://github.com/yt-dlp/yt-dlp/issues/7652)) by [c-basalt](https://github.com/c-basalt) +- **dropbox**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b9f2bc2dbed2323734a0d18e65e1e2e23dc833d8) ([#7926](https://github.com/yt-dlp/yt-dlp/issues/7926)) by [bashonly](https://github.com/bashonly), [denhotte](https://github.com/denhotte), [nathantouze](https://github.com/nathantouze) (With fixes in [099fb1b](https://github.com/yt-dlp/yt-dlp/commit/099fb1b35cf835303306549f5113d1802d79c9c7) by [bashonly](https://github.com/bashonly)) +- **eplus**: inbound: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/295fbb3ae3a7d0dd50e286be5c487cf145ed5778) ([#5782](https://github.com/yt-dlp/yt-dlp/issues/5782)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **expressen**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a5e264d74b4bd60c6e7ec4e38f1a23af4e420531) ([#8153](https://github.com/yt-dlp/yt-dlp/issues/8153)) by [kylegustavo](https://github.com/kylegustavo) +- **facebook** + - [Add dash manifest URL](https://github.com/yt-dlp/yt-dlp/commit/a854fbec56d5004f5147116a41d1dd050632a579) ([#7743](https://github.com/yt-dlp/yt-dlp/issues/7743)) by [ringus1](https://github.com/ringus1) + - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/d3d81cc98f554d0adb87d24bfd6fabaaa803944d) ([#7890](https://github.com/yt-dlp/yt-dlp/issues/7890)) by [ringus1](https://github.com/ringus1) + - [Improve format sorting](https://github.com/yt-dlp/yt-dlp/commit/308936619c8a4f3a52d73c829c2006ff6c55fea2) 
([#8074](https://github.com/yt-dlp/yt-dlp/issues/8074)) by [fireattack](https://github.com/fireattack) + - reel: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bb5d84c9d2f1e978c3eddfb5ccbe138036682a36) ([#7564](https://github.com/yt-dlp/yt-dlp/issues/7564)) by [bashonly](https://github.com/bashonly), [demon071](https://github.com/demon071) +- **fox**: [Support foxsports.com](https://github.com/yt-dlp/yt-dlp/commit/30b29f37159e9226e2f2d5434c9a4096ac4efa2e) ([#7724](https://github.com/yt-dlp/yt-dlp/issues/7724)) by [ischmidt20](https://github.com/ischmidt20) +- **funker530**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/0ce1f48bf1cb78d40d734ce73ee1c90eccf92274) ([#8040](https://github.com/yt-dlp/yt-dlp/issues/8040)) by [04-pasha-04](https://github.com/04-pasha-04) +- **generic** + - [Fix KVS thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/53675852195d8dd859555d4789944a6887171ff8) by [bashonly](https://github.com/bashonly) + - [Fix generic title for embeds](https://github.com/yt-dlp/yt-dlp/commit/994f7ef8e6003f4b7b258528755d0b6adcc31714) by [pukkandan](https://github.com/pukkandan) +- **gofile**: [Update token](https://github.com/yt-dlp/yt-dlp/commit/99c99c7185f5d8e9b3699a6fc7f86ec663d7b97e) by [bashonly](https://github.com/bashonly) +- **hotstar** + - [Extract `release_year`](https://github.com/yt-dlp/yt-dlp/commit/7237c8dca0590aa7438ade93f927df88c9381ec7) ([#7869](https://github.com/yt-dlp/yt-dlp/issues/7869)) by [Rajeshwaran2001](https://github.com/Rajeshwaran2001) + - [Make metadata extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/30ea88591b728cca0896018dbf67c2298070c669) by [bashonly](https://github.com/bashonly) + - [Support `/clips/` URLs](https://github.com/yt-dlp/yt-dlp/commit/86eeb044c2342d68c6ef177577f87852e6badd85) ([#7710](https://github.com/yt-dlp/yt-dlp/issues/7710)) by [bashonly](https://github.com/bashonly) +- **hungama**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/4b3a6ef1b3e235ba9a45142830b6edb357c71696) ([#7757](https://github.com/yt-dlp/yt-dlp/issues/7757)) by [bashonly](https://github.com/bashonly), [Yalab7](https://github.com/Yalab7) +- **indavideoembed**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/63e0c5748c0eb461a2ccca4181616eb930b4b750) ([#8129](https://github.com/yt-dlp/yt-dlp/issues/8129)) by [aky-01](https://github.com/aky-01) +- **iprima**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/568f08051841aedea968258889539741e26009e9) ([#7216](https://github.com/yt-dlp/yt-dlp/issues/7216)) by [std-move](https://github.com/std-move) +- **lbry**: [Fix original format extraction](https://github.com/yt-dlp/yt-dlp/commit/127a22460658ac39cbe5c4b3fb88d578363e0dfa) ([#7711](https://github.com/yt-dlp/yt-dlp/issues/7711)) by [bashonly](https://github.com/bashonly) +- **lecturio**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/efa2339502a37cf13ae7f143bd8b2c28f452d1cd) ([#7649](https://github.com/yt-dlp/yt-dlp/issues/7649)) by [simon300000](https://github.com/simon300000) +- **magellantv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f4ea501551526ebcb54d19b84cf0ebe798583a85) ([#7616](https://github.com/yt-dlp/yt-dlp/issues/7616)) by [bashonly](https://github.com/bashonly) +- **massengeschmack.tv**: [Fix title extraction](https://github.com/yt-dlp/yt-dlp/commit/81f46ac573dc443ad48560f308582a26784d3015) ([#7813](https://github.com/yt-dlp/yt-dlp/issues/7813)) by [sb0stn](https://github.com/sb0stn) +- **media.ccc.de**: lists: [Fix 
extraction](https://github.com/yt-dlp/yt-dlp/commit/cf11b40ac40e3d23a6352753296f3a732886efb9) ([#8144](https://github.com/yt-dlp/yt-dlp/issues/8144)) by [Rohxn16](https://github.com/Rohxn16) +- **mediaite**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/630a55df8de7747e79aa680959d785dfff2c4b76) ([#7923](https://github.com/yt-dlp/yt-dlp/issues/7923)) by [Grabien](https://github.com/Grabien) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6e07e4bc7e59f5bdb60e93c011e57b18b009f2b5) ([#8086](https://github.com/yt-dlp/yt-dlp/issues/8086)) by [bashonly](https://github.com/bashonly), [zhallgato](https://github.com/zhallgato) +- **mediastream**: [Make embed extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/635ae31f68a3ac7f6393d59657ed711e34ee3552) by [bashonly](https://github.com/bashonly) +- **mixcloud**: [Update API URL](https://github.com/yt-dlp/yt-dlp/commit/7b71643cc986de9a3768dac4ac9b64f4d05e7f5e) ([#8114](https://github.com/yt-dlp/yt-dlp/issues/8114)) by [garret1317](https://github.com/garret1317) +- **monstercat**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/eaee21bf71889d495076037cbe590c8c0b21ef3a) ([#8133](https://github.com/yt-dlp/yt-dlp/issues/8133)) by [garret1317](https://github.com/garret1317) +- **motortrendondemand**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c03a58ec9933e4a42c2d8fa80b8a0ddb2cde64e6) ([#7683](https://github.com/yt-dlp/yt-dlp/issues/7683)) by [AmirAflak](https://github.com/AmirAflak) +- **museai**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/65cfa2b057d7946fbe322155a778fe206556d0c6) ([#7614](https://github.com/yt-dlp/yt-dlp/issues/7614)) by [bashonly](https://github.com/bashonly) +- **mzaalo**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/d7aee8e310b2c4f21d50aac0b420e1b3abde21a4) by [bashonly](https://github.com/bashonly) +- **n1info**: article: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/8ac5b6d96ae5c60cd5ae2495949e0068a6754c45) ([#7373](https://github.com/yt-dlp/yt-dlp/issues/7373)) by [u-spec-png](https://github.com/u-spec-png) +- **nfl.com**: plus, replay: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1eaca74bc2ca0f5b1ec532f24c61de44f2e8cb2d) ([#7838](https://github.com/yt-dlp/yt-dlp/issues/7838)) by [bashonly](https://github.com/bashonly) +- **niconicochannelplus**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/698beb9a497f51693e64d167e572ff9efa4bc25f) ([#5686](https://github.com/yt-dlp/yt-dlp/issues/5686)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **nitter**: [Fix title extraction fallback](https://github.com/yt-dlp/yt-dlp/commit/a83da3717d30697102e76f63a6f29d77f9373c2a) ([#8102](https://github.com/yt-dlp/yt-dlp/issues/8102)) by [ApoorvShah111](https://github.com/ApoorvShah111) +- **noodlemagazine**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bae4834245a708fff97219849ec880c319c88bc6) ([#7830](https://github.com/yt-dlp/yt-dlp/issues/7830)) by [RedDeffender](https://github.com/RedDeffender) (With fixes in [69dbfe0](https://github.com/yt-dlp/yt-dlp/commit/69dbfe01c47cd078682a87f179f5846e2679e927) by [bashonly](https://github.com/bashonly)) +- **novaembed**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2269065ad60cb0ab62408ae6a7b20283e5252232) ([#7910](https://github.com/yt-dlp/yt-dlp/issues/7910)) by [std-move](https://github.com/std-move) +- **patreoncampaign**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/11de6fec9c9b8d34d1f90c8e6218ec58a3471b58) 
([#7664](https://github.com/yt-dlp/yt-dlp/issues/7664)) by [bashonly](https://github.com/bashonly) +- **pbs**: [Add extractor `PBSKidsIE`](https://github.com/yt-dlp/yt-dlp/commit/6d6081dda1290a85bdab6717f239289e3aa74c8e) ([#7602](https://github.com/yt-dlp/yt-dlp/issues/7602)) by [snixon](https://github.com/snixon) +- **piapro**: [Support `/content` URL](https://github.com/yt-dlp/yt-dlp/commit/1bcb9fe8715b1f288efc322be3de409ee0597080) ([#7592](https://github.com/yt-dlp/yt-dlp/issues/7592)) by [FinnRG](https://github.com/FinnRG) +- **piaulizaportal**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6636021206dad17c7745ae6bce6cb73d6f2ef319) ([#7903](https://github.com/yt-dlp/yt-dlp/issues/7903)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **picartovod**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/db9743894071760f994f640a4c24358f749a78c0) ([#7727](https://github.com/yt-dlp/yt-dlp/issues/7727)) by [Frankgoji](https://github.com/Frankgoji) +- **pornbox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/40999467f72db074a3f13057da9bf82a857530fe) ([#7386](https://github.com/yt-dlp/yt-dlp/issues/7386)) by [niemands](https://github.com/niemands) +- **pornhub**: [Update access cookies for UK](https://github.com/yt-dlp/yt-dlp/commit/1d3d579c2142f69831b6ae140e1d8e824e07fa0e) ([#7591](https://github.com/yt-dlp/yt-dlp/issues/7591)) by [zhong-yiyu](https://github.com/zhong-yiyu) +- **pr0gramm**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/b532556d0a85e7d76f8f0880861232fb706ddbc5) ([#8151](https://github.com/yt-dlp/yt-dlp/issues/8151)) by [Grub4K](https://github.com/Grub4K) +- **radiofrance**: [Add support for livestreams, podcasts, playlists](https://github.com/yt-dlp/yt-dlp/commit/ba8e9eb2c8bbb699f314169fab8e544437ad731e) ([#7006](https://github.com/yt-dlp/yt-dlp/issues/7006)) by [elyse0](https://github.com/elyse0) +- **rbgtum**: [Fix extraction and support new URL format](https://github.com/yt-dlp/yt-dlp/commit/5fccabac27ca3c1165ade1b0df6fbadc24258dc2) ([#7690](https://github.com/yt-dlp/yt-dlp/issues/7690)) by [simon300000](https://github.com/simon300000) +- **reddit** + - [Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/20c3c9b433dd47faf0dbde6b46e4e34eb76109a5) by [bashonly](https://github.com/bashonly) + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/9a04113dfbb69b904e4e2bea736da293505786b8) by [bashonly](https://github.com/bashonly) +- **rtvslo**: [Fix format extraction](https://github.com/yt-dlp/yt-dlp/commit/94389b225d9bcf29aa7ba8afaf1bbd7c62204eae) ([#8131](https://github.com/yt-dlp/yt-dlp/issues/8131)) by [bashonly](https://github.com/bashonly) +- **rule34video**: [Extract tags](https://github.com/yt-dlp/yt-dlp/commit/58493923e9b6f774947a2131e5258e9f3cf816be) ([#7117](https://github.com/yt-dlp/yt-dlp/issues/7117)) by [soundchaser128](https://github.com/soundchaser128) +- **rumble**: [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/23d829a3420450bcfb0788e6fb2cf4f6acdbe596) ([#8035](https://github.com/yt-dlp/yt-dlp/issues/8035)) by [trislee](https://github.com/trislee) +- **s4c** + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b9de629d78ce31699f2de886071dc257830f9676) ([#7730](https://github.com/yt-dlp/yt-dlp/issues/7730)) by [ifan-t](https://github.com/ifan-t) + - [Add series support and extract subs/thumbs](https://github.com/yt-dlp/yt-dlp/commit/fe371dcf0ba5ce8d42480eade54eeeac99ab3cb0) ([#7776](https://github.com/yt-dlp/yt-dlp/issues/7776)) by [ifan-t](https://github.com/ifan-t) +- 
**sohu**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5be7e978867b5f66ad6786c674d79d40e950ae16) ([#7628](https://github.com/yt-dlp/yt-dlp/issues/7628)) by [bashonly](https://github.com/bashonly), [c-basalt](https://github.com/c-basalt) +- **stageplus**: [Fix m3u8 extraction](https://github.com/yt-dlp/yt-dlp/commit/56b3dc03354b75be995759d8441d2754c0442b9a) ([#7929](https://github.com/yt-dlp/yt-dlp/issues/7929)) by [bashonly](https://github.com/bashonly) +- **streamanity**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/2cfe221fbbe46faa3f46552c08d947a51f424903) ([#7571](https://github.com/yt-dlp/yt-dlp/issues/7571)) by [alerikaisattera](https://github.com/alerikaisattera) +- **svtplay**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/2301b5c1b77a65abbb46b72f91e1e4666fd5d985) ([#7789](https://github.com/yt-dlp/yt-dlp/issues/7789)) by [dirkf](https://github.com/dirkf), [wader](https://github.com/wader) +- **tbsjp**: [Add episode, program, playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/876b70c8edf4c0147f180bd981fbc4d625cbfb9c) ([#7765](https://github.com/yt-dlp/yt-dlp/issues/7765)) by [garret1317](https://github.com/garret1317) +- **tiktok** + - [Fix audio-only format extraction](https://github.com/yt-dlp/yt-dlp/commit/b09bd0c19648f60c59fb980cd454cb0069959fb9) ([#7712](https://github.com/yt-dlp/yt-dlp/issues/7712)) by [bashonly](https://github.com/bashonly) + - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/069cbece9dba6384f1cc5fcfc7ce562a31af42fc) by [bashonly](https://github.com/bashonly) +- **triller**: [Fix unlisted video extraction](https://github.com/yt-dlp/yt-dlp/commit/39837ae3199aa934299badbd0d63243ed639e6c8) ([#7670](https://github.com/yt-dlp/yt-dlp/issues/7670)) by [bashonly](https://github.com/bashonly) +- **tv5mondeplus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7d3d658f4c558ee7d72b1c01b46f2126948681cd) ([#7952](https://github.com/yt-dlp/yt-dlp/issues/7952)) by [dirkf](https://github.com/dirkf), [korli](https://github.com/korli) +- **twitcasting** + - [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cebbd33b1c678149fc8f0e254db6fc0da317ea80) ([#8120](https://github.com/yt-dlp/yt-dlp/issues/8120)) by [c-basalt](https://github.com/c-basalt) + - [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/c1d71d0d9f41db5e4306c86af232f5f6220a130b) ([#7975](https://github.com/yt-dlp/yt-dlp/issues/7975)) by [at-wat](https://github.com/at-wat) +- **twitter** + - [Add fallback, improve error handling](https://github.com/yt-dlp/yt-dlp/commit/6014355c6142f68e20c8374e3787e5b5820f19e2) ([#7621](https://github.com/yt-dlp/yt-dlp/issues/7621)) by [bashonly](https://github.com/bashonly) + - [Fix GraphQL and legacy API](https://github.com/yt-dlp/yt-dlp/commit/92315c03774cfabb3a921884326beb4b981f786b) ([#7516](https://github.com/yt-dlp/yt-dlp/issues/7516)) by [bashonly](https://github.com/bashonly) + - [Fix retweet extraction and syndication API](https://github.com/yt-dlp/yt-dlp/commit/a006ce2b27357c15792eb5c18f06765e640b801c) ([#8016](https://github.com/yt-dlp/yt-dlp/issues/8016)) by [bashonly](https://github.com/bashonly) + - [Revert 92315c03774cfabb3a921884326beb4b981f786b](https://github.com/yt-dlp/yt-dlp/commit/b03fa7834579a01cc5fba48c0e73488a16683d48) by [pukkandan](https://github.com/pukkandan) + - spaces + - [Fix format protocol](https://github.com/yt-dlp/yt-dlp/commit/613dbce177d34ffc31053e8e01acf4bb107bcd1e) ([#7550](https://github.com/yt-dlp/yt-dlp/issues/7550)) by 
[bashonly](https://github.com/bashonly) + - [Pass referer header to downloader](https://github.com/yt-dlp/yt-dlp/commit/c6ef553792ed48462f9fd0e78143bef6b1a71c2e) by [bashonly](https://github.com/bashonly) +- **unsupported**: [List more sites with DRM](https://github.com/yt-dlp/yt-dlp/commit/e7057383380d7d53815f8feaf90ca3dcbde88983) by [pukkandan](https://github.com/pukkandan) +- **videa**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/98eac0e6ba0e510ae7dfdfd249d42ee71fb272b1) ([#8003](https://github.com/yt-dlp/yt-dlp/issues/8003)) by [aky-01](https://github.com/aky-01), [hatsomatt](https://github.com/hatsomatt) +- **vrt**: [Update token signing key](https://github.com/yt-dlp/yt-dlp/commit/325191d0c9bf3fe257b8a7c2eb95080f44f6ddfc) ([#7519](https://github.com/yt-dlp/yt-dlp/issues/7519)) by [Zprokkel](https://github.com/Zprokkel) +- **wat.tv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7cccab79e7d00ed965b48b8cefce1da8a0513409) ([#7898](https://github.com/yt-dlp/yt-dlp/issues/7898)) by [davinkevin](https://github.com/davinkevin) +- **wdr**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0395498d7065aa5e55bac85fa9354b4b0d48eb) ([#7979](https://github.com/yt-dlp/yt-dlp/issues/7979)) by [szabyg](https://github.com/szabyg) +- **web.archive**: vlive: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/9652bca1bd02f6bc1b8cb1e186f2ccbf32225561) ([#8132](https://github.com/yt-dlp/yt-dlp/issues/8132)) by [bashonly](https://github.com/bashonly) +- **weibo**: [Fix extractor and support user extraction](https://github.com/yt-dlp/yt-dlp/commit/69b03f84f8378b0b5a2fbae56f9b7d860b2f529e) ([#7657](https://github.com/yt-dlp/yt-dlp/issues/7657)) by [c-basalt](https://github.com/c-basalt) +- **weverse**: [Support extraction without auth](https://github.com/yt-dlp/yt-dlp/commit/c2d8ee0000302aba63476b7d5bd8793e57b6c8c6) ([#7924](https://github.com/yt-dlp/yt-dlp/issues/7924)) by [seproDev](https://github.com/seproDev) +- **wimbledon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a15fcd299e767a510debd8dc1646fe863b96ce0e) ([#7551](https://github.com/yt-dlp/yt-dlp/issues/7551)) by [nnoboa](https://github.com/nnoboa) +- **wrestleuniverseppv**: [Fix HLS AES key extraction](https://github.com/yt-dlp/yt-dlp/commit/dae349da97cafe7357106a8f3187fd48a2ad1210) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `player_params` extractor arg](https://github.com/yt-dlp/yt-dlp/commit/ba06d77a316650ff057347d224b5afa8b203ad65) ([#7719](https://github.com/yt-dlp/yt-dlp/issues/7719)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `player_params` arg being converted to lowercase](https://github.com/yt-dlp/yt-dlp/commit/546b2c28a106cf8101d481b215b676d1b091d276) by [coletdjnz](https://github.com/coletdjnz) + - [Fix consent cookie](https://github.com/yt-dlp/yt-dlp/commit/378ae9f9fb8e8c86e6ac89c4c5b815b48ce93620) ([#7774](https://github.com/yt-dlp/yt-dlp/issues/7774)) by [coletdjnz](https://github.com/coletdjnz) + - tab: [Detect looping feeds](https://github.com/yt-dlp/yt-dlp/commit/1ba6fe9db5f660d5538588315c23ad6cf0371c5f) ([#6621](https://github.com/yt-dlp/yt-dlp/issues/6621)) by [coletdjnz](https://github.com/coletdjnz) +- **zaiko**: [Improve thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/ecef42c3adbcb6a84405139047923c4967316f28) ([#8054](https://github.com/yt-dlp/yt-dlp/issues/8054)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **zee5**: [Update access token 
endpoint](https://github.com/yt-dlp/yt-dlp/commit/a0de8bb8601146b8f87bf7cd562eef8bfb4690be) ([#7914](https://github.com/yt-dlp/yt-dlp/issues/7914)) by [bashonly](https://github.com/bashonly) +- **zoom**: [Extract duration](https://github.com/yt-dlp/yt-dlp/commit/66cc64ff6696f9921ff112a278542f8d999ffea4) by [bashonly](https://github.com/bashonly) + +#### Downloader changes +- **external** + - [Fix ffmpeg input from stdin](https://github.com/yt-dlp/yt-dlp/commit/e57eb98222d29cc4c09ee975d3c492274a6e5be3) ([#7655](https://github.com/yt-dlp/yt-dlp/issues/7655)) by [bashonly](https://github.com/bashonly) + - [Fixes to cookie handling](https://github.com/yt-dlp/yt-dlp/commit/42ded0a429c20ec13dc006825e1508d9a02f0ad4) by [bashonly](https://github.com/bashonly) + +#### Postprocessor changes +- **embedthumbnail**: [Support `m4v`](https://github.com/yt-dlp/yt-dlp/commit/8a4cd12c8f8e93292e3e95200b9d17a3af39624c) ([#7583](https://github.com/yt-dlp/yt-dlp/issues/7583)) by [Neurognostic](https://github.com/Neurognostic) + +#### Networking changes +- [Add module](https://github.com/yt-dlp/yt-dlp/commit/c365dba8430ee33abda85d31f95128605bf240eb) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [pukkandan](https://github.com/pukkandan) +- [Add request handler preference framework](https://github.com/yt-dlp/yt-dlp/commit/db7b054a6111ca387220d0eb87bf342f9c130eb8) ([#7603](https://github.com/yt-dlp/yt-dlp/issues/7603)) by [coletdjnz](https://github.com/coletdjnz) +- [Add strict Request extension checking](https://github.com/yt-dlp/yt-dlp/commit/86aea0d3a213da3be1da638b9b828e6f0ee1d59f) ([#7604](https://github.com/yt-dlp/yt-dlp/issues/7604)) by [coletdjnz](https://github.com/coletdjnz) +- [Fix POST requests with zero-length payloads](https://github.com/yt-dlp/yt-dlp/commit/71baa490ebd3655746430f208a9b605d120cd315) ([#7648](https://github.com/yt-dlp/yt-dlp/issues/7648)) by [bashonly](https://github.com/bashonly) +- [Fix `--legacy-server-connect`](https://github.com/yt-dlp/yt-dlp/commit/75dc8e673b481a82d0688aeec30f6c65d82bb359) ([#7645](https://github.com/yt-dlp/yt-dlp/issues/7645)) by [bashonly](https://github.com/bashonly) +- [Fix various socks proxy bugs](https://github.com/yt-dlp/yt-dlp/commit/20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69) ([#8065](https://github.com/yt-dlp/yt-dlp/issues/8065)) by [coletdjnz](https://github.com/coletdjnz) +- [Ignore invalid proxies in env](https://github.com/yt-dlp/yt-dlp/commit/bbeacff7fcaa3b521066088a5ccbf34ef5070d1d) ([#7704](https://github.com/yt-dlp/yt-dlp/issues/7704)) by [coletdjnz](https://github.com/coletdjnz) +- [Rewrite architecture](https://github.com/yt-dlp/yt-dlp/commit/227bf1a33be7b89cd7d44ad046844c4ccba104f4) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler** + - urllib + - [Remove dot segments during URL normalization](https://github.com/yt-dlp/yt-dlp/commit/4bf912282a34b58b6b35d8f7e6be535770c89c76) ([#7662](https://github.com/yt-dlp/yt-dlp/issues/7662)) by [coletdjnz](https://github.com/coletdjnz) + - [Simplify gzip decoding](https://github.com/yt-dlp/yt-dlp/commit/59e92b1f1833440bb2190f847eb735cf0f90bc85) ([#7611](https://github.com/yt-dlp/yt-dlp/issues/7611)) by [Grub4K](https://github.com/Grub4K) (With fixes in [77bff23](https://github.com/yt-dlp/yt-dlp/commit/77bff23ee97565bab2e0d75b893a21bf7983219a)) + +#### Misc. 
changes +- **build**: [Make sure deprecated modules are added](https://github.com/yt-dlp/yt-dlp/commit/131d132da5c98c6c78bd7eed4b37f4458561b3d9) by [pukkandan](https://github.com/pukkandan) +- **cleanup** + - [Add color to `download-archive` message](https://github.com/yt-dlp/yt-dlp/commit/2b029ca0a9f9105c4f7626993fa60e54c9782749) ([#5138](https://github.com/yt-dlp/yt-dlp/issues/5138)) by [aaruni96](https://github.com/aaruni96), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - Miscellaneous + - [6148833](https://github.com/yt-dlp/yt-dlp/commit/6148833f5ceb7674142ddb8d761ffe03cee7df69), [62b5c94](https://github.com/yt-dlp/yt-dlp/commit/62b5c94cadaa5f596dc1a7083db9db12efe357be) by [pukkandan](https://github.com/pukkandan) + - [5ca095c](https://github.com/yt-dlp/yt-dlp/commit/5ca095cbcde3e32642a4fe5b2d69e8e3c785a021) by [barsnick](https://github.com/barsnick), [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [sqrtNOT](https://github.com/sqrtNOT) + - [088add9](https://github.com/yt-dlp/yt-dlp/commit/088add9567d39b758737e4299a0e619fd89d2e8f) by [Grub4K](https://github.com/Grub4K) +- **devscripts**: `make_changelog`: [Fix changelog grouping and add networking group](https://github.com/yt-dlp/yt-dlp/commit/30ba233d4cee945756ed7344e7ddb3a90d2ae608) ([#8124](https://github.com/yt-dlp/yt-dlp/issues/8124)) by [Grub4K](https://github.com/Grub4K) +- **docs**: [Update collaborators](https://github.com/yt-dlp/yt-dlp/commit/1be0a96a4d14f629097509fcc89d15f69a8243c7) by [Grub4K](https://github.com/Grub4K) +- **test** + - [Add tests for socks proxies](https://github.com/yt-dlp/yt-dlp/commit/fcd6a76adc49d5cd8783985c7ce35384b72e545f) ([#7908](https://github.com/yt-dlp/yt-dlp/issues/7908)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `httplib_validation_errors` test for old Python versions](https://github.com/yt-dlp/yt-dlp/commit/95abea9a03289da1384e5bda3d590223ccc0a238) ([#7677](https://github.com/yt-dlp/yt-dlp/issues/7677)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `test_load_certifi`](https://github.com/yt-dlp/yt-dlp/commit/de20687ee6b742646128a7629b57096631a20619) by [pukkandan](https://github.com/pukkandan) + - download: [Test for `expected_exception`](https://github.com/yt-dlp/yt-dlp/commit/661c9a1d029296b28e0b2f8be8a72a43abaf6536) by [at-wat](https://github.com/at-wat) + ### 2023.07.06 #### Important changes diff --git a/supportedsites.md b/supportedsites.md index 379d28ef38..620e0f3058 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -77,7 +77,7 @@ # Supported sites - **AnimalPlanet** - **ant1newsgr:article**: ant1news.gr articles - **ant1newsgr:embed**: ant1news.gr embedded videos - - **ant1newsgr:watch**: ant1news.gr videos + - **antenna:watch**: antenna.gr and ant1news.gr videos - **Anvato** - **aol.com**: Yahoo screen and movies - **APA** @@ -98,8 +98,6 @@ # Supported sites - **ArteTVCategory** - **ArteTVEmbed** - **ArteTVPlaylist** - - **AsianCrush** - - **AsianCrushPlaylist** - **AtresPlayer**: [*atresplayer*](## "netrc machine") - **AtScaleConfEvent** - **ATTTechChannel** @@ -118,6 +116,7 @@ # Supported sites - **awaan:live** - **awaan:season** - **awaan:video** + - **axs.tv** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 - **BanBye** @@ -162,11 +161,16 @@ # Supported sites - **BilibiliAudioAlbum** - **BiliBiliBangumi** - **BiliBiliBangumiMedia** + - **BiliBiliBangumiSeason** + - **BilibiliCollectionList** + - 
**BilibiliFavoritesList** - **BiliBiliPlayer** + - **BilibiliPlaylist** - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix + - **BilibiliSeriesList** - **BilibiliSpaceAudio** - - **BilibiliSpacePlaylist** - **BilibiliSpaceVideo** + - **BilibiliWatchlater** - **BiliIntl**: [*biliintl*](## "netrc machine") - **biliIntl:series**: [*biliintl*](## "netrc machine") - **BiliLive** @@ -201,6 +205,8 @@ # Supported sites - **BreitBart** - **brightcove:legacy** - **brightcove:new** + - **Brilliantpala:Classes**: [*brilliantpala*](## "netrc machine") VoD on classes.brilliantpala.org + - **Brilliantpala:Elearn**: [*brilliantpala*](## "netrc machine") VoD on elearn.brilliantpala.org - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen @@ -220,14 +226,17 @@ # Supported sites - **Camsoda** - **CamtasiaEmbed** - **CamWithHer** + - **Canal1** - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr + - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - **CarambaTV** - **CarambaTVPage** - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** + - **cbc.ca:​player:playlist** - **CBS** - **CBSInteractive** - **CBSLocal** @@ -257,6 +266,8 @@ # Supported sites - **Cinchcast** - **Cinemax** - **CinetecaMilano** + - **Cineverse** + - **CineverseDetails** - **CiscoLiveSearch** - **CiscoLiveSession** - **ciscowebex**: Cisco Webex @@ -365,7 +376,7 @@ # Supported sites - **Dotsub** - **Douyin** - **DouyuShow** - - **DouyuTV**: 斗鱼 + - **DouyuTV**: 斗鱼直播 - **DPlay** - **DRBonanza** - **Drooble** @@ -408,6 +419,7 @@ # Supported sites - **Engadget** - **Epicon** - **EpiconSeries** + - **eplus:inbound**: e+ (イープラス) overseas - **Epoch** - **Eporner** - **EroProfile**: [*eroprofile*](## "netrc machine") @@ -732,6 +744,7 @@ # Supported sites - **lynda**: [*lynda*](## "netrc machine") lynda.com videos - **lynda:course**: [*lynda*](## "netrc machine") lynda.com online courses - **m6** + - **MagellanTV** - **MagentaMusik360** - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru @@ -812,6 +825,7 @@ # Supported sites - **Mofosex** - **MofosexEmbed** - **Mojvideo** + - **Monstercat** - **MonsterSirenHypergryphMusic** - **Morningstar**: morningstar.com - **Motherless** @@ -840,6 +854,7 @@ # Supported sites - **MujRozhlas** - **Murrtube** - **MurrtubeUser**: Murrtube user profile + - **MuseAI** - **MuseScore** - **MusicdexAlbum** - **MusicdexArtist** @@ -944,6 +959,9 @@ # Supported sites - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs + - **NiconicoChannelPlus**: ニコニコチャンネルプラス + - **NiconicoChannelPlus:​channel:lives**: ニコニコチャンネルプラス - チャンネル - ライブリスト. nicochannel.jp/channel/lives + - **NiconicoChannelPlus:​channel:videos**: ニコニコチャンネルプラス - チャンネル - 動画リスト. 
nicochannel.jp/channel/videos - **NiconicoUser** - **nicovideo:search**: Nico video search; "nicosearch:" prefix - **nicovideo:​search:date**: Nico video search, newest first; "nicosearchdate:" prefix @@ -1046,6 +1064,7 @@ # Supported sites - **Patreon** - **PatreonCampaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) + - **PBSKids** - **PearVideo** - **PeekVids** - **peer.tv** @@ -1062,6 +1081,7 @@ 
# Supported sites - **phoenix.de** - **Photobucket** - **Piapro**: [*piapro*](## "netrc machine") + - **PIAULIZAPortal**: ulizaportal.jp - PIA LIVE STREAM - **Picarto** - **PicartoVod** - **Piksel** @@ -1105,6 +1125,7 @@ # Supported sites - **polskieradio:​podcast:list** - **Popcorntimes** - **PopcornTV** + - **Pornbox** - **PornCom** - **PornerBros** - **Pornez** @@ -1121,7 +1142,6 @@ # Supported sites - **PornTop** - **PornTube** - **Pr0gramm** - - **Pr0grammStatic** - **PrankCast** - **PremiershipRugby** - **PressTV** @@ -1156,6 +1176,10 @@ # Supported sites - **radiocanada** - **radiocanada:audiovideo** - **radiofrance** + - **RadioFranceLive** + - **RadioFrancePodcast** + - **RadioFranceProfile** + - **RadioFranceProgramSchedule** - **RadioJavan** - **radiokapital** - **radiokapital:show** @@ -1177,6 +1201,7 @@ # Supported sites - **RayWenderlichCourse** - **RbgTum** - **RbgTumCourse** + - **RbgTumNewCourse** - **RBMARadio** - **RCS** - **RCSEmbeds** @@ -1259,6 +1284,8 @@ # Supported sites - **Ruutu** - **Ruv** - **ruv.is:spila** + - **S4C** + - **S4CSeries** - **safari**: [*safari*](## "netrc machine") safaribooksonline.com online video - **safari:api**: [*safari*](## "netrc machine") - **safari:course**: [*safari*](## "netrc machine") safaribooksonline.com online courses @@ -1325,6 +1352,7 @@ # Supported sites - **Smotrim** - **Snotr** - **Sohu** + - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") - **SonyLIVSeries** - **soundcloud**: [*soundcloud*](## "netrc machine") @@ -1378,7 +1406,6 @@ # Supported sites - **StoryFireSeries** - **StoryFireUser** - **Streamable** - - **Streamanity** - **streamcloud.eu** - **StreamCZ** - **StreamFF** @@ -1403,6 +1430,9 @@ # Supported sites - **Tagesschau** - **Tass** - **TBS** + - **TBSJPEpisode** + - **TBSJPPlaylist** + - **TBSJPProgram** - **TDSLifeway** - **Teachable**: [*teachable*](## "netrc machine") - **TeachableCourse**: [*teachable*](## "netrc machine") @@ -1702,7 +1732,6 @@ # Supported sites - **wdr:mobile**: (**Currently broken**) - **WDRElefant** - **WDRPage** - - **web.archive:vlive**: web.archive.org saved vlive videos - **web.archive:youtube**: web.archive.org saved youtube videos, "ytarchive:" prefix - **Webcamerapl** - **Webcaster** @@ -1710,7 +1739,8 @@ # Supported sites - **WebOfStories** - **WebOfStoriesPlaylist** - **Weibo** - - **WeiboMobile** + - **WeiboUser** + - **WeiboVideo** - **WeiqiTV**: WQTV - **wetv:episode** - **WeTvSeries** @@ -1726,6 +1756,7 @@ # Supported sites - **Whyp** - **wikimedia.org** - **Willow** + - **Wimbledon** - **WimTV** - **WinSportsVideo** - **Wistia** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 67cfe44efd..2a7c84b93f 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.07.06' +__version__ = '2023.09.24' -RELEASE_GIT_HEAD = 'b532a3481046e1eabb6232ee8196fb696c356ff6' +RELEASE_GIT_HEAD = '088add9567d39b758737e4299a0e619fd89d2e8f' VARIANT = None From eb5bdbfa70126c7d5355cc0954b63720522e462c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Tue, 3 Oct 2023 19:42:30 +1300 Subject: [PATCH 427/501] [ie/youtube] Raise a warning for `Incomplete Data` instead of an error (#8238) Closes https://github.com/yt-dlp/yt-dlp/issues/8206 Adds `raise_incomplete_data` extractor arg to revert this behaviour and raise an error. 
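In rough terms, the new control flow amounts to the sketch below — a simplified stand-in with made-up names and retry limits, not the patched code; the real implementation drives two `RetryManager` instances by hand, as the diff that follows shows.

```python
def fetch_with_split_retries(fetch, is_incomplete, *, net_retries=3,
                             icd_retries=3, raise_incomplete_data=False):
    """Retry transient failures, but let 'incomplete data' end in a warning.

    `fetch` and `is_incomplete` are illustrative stand-ins for the API call
    and the "Incomplete Data Received" check in the actual extractor.
    """
    net_errors = icd_errors = 0
    while True:
        try:
            response = fetch()
        except ConnectionError:  # stand-in for transient network/5xx failures
            net_errors += 1
            if net_errors > net_retries:
                raise  # network failures remain fatal, as before
            continue
        if is_incomplete(response):
            icd_errors += 1
            if icd_errors > icd_retries:
                if raise_incomplete_data:  # opt back in via the extractor arg
                    raise ValueError('Incomplete data received')
                print('WARNING: Incomplete data received')  # new default
                return None
            continue
        return response
```

Keeping the two retry budgets separate is the point of the change: genuine network errors stay fatal, while the flaky `Incomplete Data Received` condition merely warns once its budget is exhausted, unless the user sets `raise_incomplete_data`.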
Authored by: coletdjnz Co-authored-by: Simon Sawicki --- README.md | 1 + yt_dlp/extractor/youtube.py | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7bf4465721..a0b69c9a1a 100644 --- a/README.md +++ b/README.md @@ -1809,6 +1809,7 @@ #### youtube * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests +* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a39d17cf11..7e13aa7797 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -941,7 +941,13 @@ def _parse_time_text(self, text): def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): - for retry in self.RetryManager(): + raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE)) + # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal. + icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete)) + icd_rm = next(icd_retries) + main_retries = iter(self.RetryManager()) + main_rm = next(main_retries) + for _ in range(main_rm.retries + icd_rm.retries + 1): try: response = self._call_api( ep=ep, fatal=True, headers=headers, @@ -953,7 +959,8 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers if not isinstance(e.cause, network_exceptions): return self._error_or_warning(e, fatal=fatal) elif not isinstance(e.cause, HTTPError): - retry.error = e + main_rm.error = e + next(main_retries) continue first_bytes = e.cause.response.read(512) @@ -965,27 +972,32 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers if yt_error: self._report_alerts([('ERROR', yt_error)], fatal=False) # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # Sometimes a 404 is also received. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289
             # We also want to catch all other network exceptions since errors in later pages can be troublesome
             # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
             if e.cause.status not in (403, 429):
-                retry.error = e
+                main_rm.error = e
+                next(main_retries)
                 continue
             return self._error_or_warning(e, fatal=fatal)
 
         try:
             self._extract_and_report_alerts(response, only_once=True)
         except ExtractorError as e:
-            # YouTube servers may return errors we want to retry on in a 200 OK response
+            # YouTube's servers may return errors we want to retry on in a 200 OK response
             # See: https://github.com/yt-dlp/yt-dlp/issues/839
             if 'unknown error' in e.msg.lower():
-                retry.error = e
+                main_rm.error = e
+                next(main_retries)
                 continue
             return self._error_or_warning(e, fatal=fatal)
         # Youtube sometimes sends incomplete data
         # See: https://github.com/ytdl-org/youtube-dl/issues/28194
         if not traverse_obj(response, *variadic(check_get_keys)):
-            retry.error = ExtractorError('Incomplete data received', expected=True)
+            icd_rm.error = ExtractorError('Incomplete data received', expected=True)
+            should_retry = next(icd_retries, None)
+            if not should_retry:
+                return None
             continue
 
         return response

From cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3 Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Tue, 3 Oct 2023 11:33:40 +0200
Subject: [PATCH 428/501] [ie/xhamster:user] Support creator urls (#8232)

Authored by: Grub4K
---
 yt_dlp/extractor/xhamster.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py
index 37224799bf..aec1f20bb8 100644
--- a/yt_dlp/extractor/xhamster.py
+++ b/yt_dlp/extractor/xhamster.py
@@ -407,7 +407,7 @@ def _real_extract(self, url):
 
 
 class XHamsterUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)' % XHamsterIE._DOMAINS
+    _VALID_URL = rf'https?://(?:[^/?#]+\.)?{XHamsterIE._DOMAINS}/(?:(?P<user>users)|creators)/(?P<id>[^/?#&]+)'
     _TESTS = [{
         # Paginated user profile
         'url': 'https://xhamster.com/users/netvideogirls/videos',
@@ -422,6 +422,12 @@ class XHamsterUserIE(InfoExtractor):
             'id': 'firatkaan',
         },
         'playlist_mincount': 1,
+    }, {
+        'url': 'https://xhamster.com/creators/squirt-orgasm-69',
+        'info_dict': {
+            'id': 'squirt-orgasm-69',
+        },
+        'playlist_mincount': 150,
     }, {
         'url': 'https://xhday.com/users/mobhunter',
         'only_matching': True,
@@ -430,8 +436,9 @@ class XHamsterUserIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    def _entries(self, user_id):
-        next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id
+    def _entries(self, user_id, is_user):
+        prefix, suffix = ('users', 'videos') if is_user else ('creators', 'exclusive')
+        next_page_url = f'https://xhamster.com/{prefix}/{user_id}/{suffix}/1'
        for pagenum in itertools.count(1):
             page = self._download_webpage(
                 next_page_url, user_id, 'Downloading page %s' % pagenum)
@@ -454,5 +461,5 @@ def _entries(self, user_id):
             break
 
     def _real_extract(self, url):
-        user_id = self._match_id(url)
-        return self.playlist_result(self._entries(user_id), user_id)
+        user, user_id = self._match_valid_url(url).group('user', 'id')
+        return self.playlist_result(self._entries(user_id, bool(user)), user_id)

From 0730d5a966fa8a937d84bfb7f68be5198acb039b Mon Sep 17 00:00:00 2001
From: bashonly
Date: Wed, 4 Oct 2023 12:44:13 -0500
Subject: [PATCH 429/501] [ie/gofile] Fix token cookie bug

Authored by: bashonly
---
 yt_dlp/extractor/gofile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py index 8983905839..ef14b57d08 100644 --- a/yt_dlp/extractor/gofile.py +++ b/yt_dlp/extractor/gofile.py @@ -60,7 +60,7 @@ def _real_initialize(self): account_data = self._download_json( 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') self._TOKEN = account_data['data']['token'] - self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + self._set_cookie('.gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): query_params = { From b095fd3fa9d58a65dc9b830bd63b9d909422aa86 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 4 Oct 2023 13:01:52 -0500 Subject: [PATCH 430/501] [ie/WrestleUniverseVOD] Call API with device ID (#8272) Closes #8271 Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index dd12804db3..145246a148 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -190,10 +190,7 @@ class WrestleUniverseVODIE(WrestleUniverseBaseIE): def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') metadata = self._download_metadata(url, video_id, lang, 'videoEpisodeFallbackData') - video_data = self._call_api(video_id, ':watch', 'watch', data={ - # 'deviceId' is required if ignoreDeviceRestriction is False - 'ignoreDeviceRestriction': True, - }) + video_data = self._call_api(video_id, ':watch', 'watch', data={'deviceId': self._DEVICE_ID}) return { 'id': video_id, From 91a670a4f7babe9c8aa2018f57d8c8952a6f49d8 Mon Sep 17 00:00:00 2001 From: gillux Date: Sat, 7 Oct 2023 06:27:54 +0800 Subject: [PATCH 431/501] [ie/LiTV] Fix extractor (#7785) Closes #5456 Authored by: jiru --- yt_dlp/extractor/litv.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 19b298ec6c..2c7c7175ea 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -13,7 +13,7 @@ class LiTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s' _TESTS = [{ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', @@ -21,16 +21,18 @@ class LiTVIE(InfoExtractor): 'id': 'VOD00041606', 'title': '花千骨', }, - 'playlist_count': 50, + 'playlist_count': 51, # 50 episodes + 1 trailer }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'md5': '969e343d9244778cb29acec608e53640', + 'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', 'title': '花千骨第1集', 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。', + 'categories': ['奇幻', '愛情', '中國', '仙俠'], + 'episode': 'Episode 1', 'episode_number': 1, }, 'params': { @@ -46,20 +48,17 @@ class LiTVIE(InfoExtractor): 'title': '芈月傳第1集 霸星芈月降世楚國', 'description': 
'楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。',
         },
-        'skip': 'Georestricted to Taiwan',
+        'skip': 'No longer exists',
     }]
 
-    def _extract_playlist(self, season_list, video_id, program_info, prompt=True):
-        episode_title = program_info['title']
-        content_id = season_list['contentId']
-
+    def _extract_playlist(self, playlist_data, content_type):
         all_episodes = [
             self.url_result(smuggle_url(
-                self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']),
+                self._URL_TEMPLATE % (content_type, episode['contentId']),
                 {'force_noplaylist': True}))  # To prevent infinite recursion
-            for episode in season_list['episode']]
+            for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))]
 
-        return self.playlist_result(all_episodes, content_id, episode_title)
+        return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title'))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -68,24 +67,31 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(url, video_id)
 
+        if self._search_regex(
+                r'(?i)<meta\s[^>]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"',
+                webpage, 'meta refresh redirect', default=False, group=0):
+            raise ExtractorError('No such content found', expected=True)
+
         program_info = self._parse_json(self._search_regex(
             r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
             video_id)
 
-        season_list = list(program_info.get('seasonList', {}).values())
-        playlist_id = traverse_obj(season_list, 0, 'contentId')
-        if self._yes_playlist(playlist_id, video_id, smuggled_data):
-            return self._extract_playlist(season_list[0], video_id, program_info)
-
-        # In browsers `getMainUrl` request is always issued. Usually this
+        # In browsers `getProgramInfo` request is always issued. Usually this
         # endpoint gives the same result as the data embedded in the webpage.
-        # If georestricted, there are no embedded data, so an extra request is
-        # necessary to get the error code
+        # If, for some reason, there are no embedded data, we do an extra request.
if 'assetId' not in program_info: program_info = self._download_json( 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, query={'contentId': video_id}, headers={'Accept': 'application/json'}) + + series_id = program_info['seriesId'] + if self._yes_playlist(series_id, video_id, smuggled_data): + playlist_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getSeriesTree', video_id, + query={'seriesId': series_id}, headers={'Accept': 'application/json'}) + return self._extract_playlist(playlist_data, program_info['contentType']) + video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) @@ -96,7 +102,7 @@ def _real_extract(self, url): 'contentType': program_info['contentType'], } video_data = self._download_json( - 'https://www.litv.tv/vod/getMainUrl', video_id, + 'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id, data=json.dumps(payload).encode('utf-8'), headers={'Content-Type': 'application/json'}) From f980df734cf5c0eaded2f7b38c6c60bccfeebb48 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 6 Oct 2023 18:31:33 -0400 Subject: [PATCH 432/501] [ie/neteasemusic] Fix extractors (#8181) Closes #4388 Authored by: c-basalt --- yt_dlp/extractor/neteasemusic.py | 575 +++++++++++++++++-------------- 1 file changed, 312 insertions(+), 263 deletions(-) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 5b7307bc8f..68bfcb6ba7 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -2,105 +2,74 @@ import json import re import time -from base64 import b64encode -from binascii import hexlify -from datetime import datetime from hashlib import md5 from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding -from ..compat import compat_urllib_parse_urlencode -from ..networking import Request from ..utils import ( ExtractorError, - bytes_to_intlist, - error_to_compat_str, - float_or_none, int_or_none, - intlist_to_bytes, - try_get, + join_nonempty, + str_or_none, + strftime_or_none, + traverse_obj, + unified_strdate, + url_or_none, + urljoin, + variadic, ) class NetEaseMusicBaseIE(InfoExtractor): _FORMATS = ['bMusic', 'mMusic', 'hMusic'] - _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' _API_BASE = 'http://music.163.com/api/' + _GEO_BYPASS = False - @classmethod - def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(str(dfsid).encode('ascii')) - salt_len = len(salt_bytes) - for i in range(len(string_bytes)): - string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] - m = md5() - m.update(bytes(string_bytes)) - result = b64encode(m.digest()).decode('ascii') - return result.replace('/', '_').replace('+', '-') + @staticmethod + def kilo_or_none(value): + return int_or_none(value, scale=1000) - def make_player_api_request_data_and_headers(self, song_id, bitrate): - KEY = b'e82ckenh8dichen8' - URL = '/api/song/enhance/player/url' - now = int(time.time() * 1000) - rand = randint(0, 1000) - cookie = { - 'osver': None, - 'deviceId': None, + def _create_eapi_cipher(self, api_path, query_body, cookies): + request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) + + message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1') + msg_digest = md5(message).hexdigest() + + data = pkcs7_padding(list(str.encode( + 
f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}'))) + encrypted = bytes(aes_ecb_encrypt(data, list(b'e82ckenh8dichen8'))) + return f'params={encrypted.hex().upper()}'.encode() + + def _download_eapi_json(self, path, video_id, query_body, headers={}, **kwargs): + cookies = { + 'osver': 'undefined', + 'deviceId': 'undefined', 'appver': '8.0.0', 'versioncode': '140', - 'mobilename': None, + 'mobilename': 'undefined', 'buildver': '1623435496', 'resolution': '1920x1080', '__csrf': '', 'os': 'pc', - 'channel': None, - 'requestId': '{0}_{1:04}'.format(now, rand), + 'channel': 'undefined', + 'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}', + **traverse_obj(self._get_cookies(self._API_BASE), { + 'MUSIC_U': ('MUSIC_U', {lambda i: i.value}), + }) } - request_text = json.dumps( - {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, - separators=(',', ':')) - message = 'nobody{0}use{1}md5forencrypt'.format( - URL, request_text).encode('latin1') - msg_digest = md5(message).hexdigest() - - data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( - URL, request_text, msg_digest) - data = pkcs7_padding(bytes_to_intlist(data)) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) - encrypted_params = hexlify(encrypted).decode('ascii').upper() - - cookie = '; '.join( - ['{0}={1}'.format(k, v if v is not None else 'undefined') - for [k, v] in cookie.items()]) - - headers = { - 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'], - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': 'https://music.163.com', - 'Cookie': cookie, - } - return ('params={0}'.format(encrypted_params), headers) + return self._download_json( + urljoin('https://interface3.music.163.com/', f'/eapi{path}'), video_id, + data=self._create_eapi_cipher(f'/api{path}', query_body, cookies), headers={ + 'Referer': 'https://music.163.com', + 'Cookie': '; '.join([f'{k}={v}' for k, v in cookies.items()]), + **headers, + }, **kwargs) def _call_player_api(self, song_id, bitrate): - url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' - data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) - try: - msg = 'empty result' - result = self._download_json( - url, song_id, data=data.encode('ascii'), headers=headers) - if result: - return result - except ExtractorError as e: - if type(e.cause) in (ValueError, TypeError): - # JSON load failure - raise - except Exception as e: - msg = error_to_compat_str(e) - self.report_warning('%s API call (%s) failed: %s' % ( - song_id, bitrate, msg)) - return {} + return self._download_eapi_json( + '/song/enhance/player/url', song_id, {'ids': f'[{song_id}]', 'br': bitrate}, + note=f'Downloading song URL info: bitrate {bitrate}') def extract_formats(self, info): err = 0 @@ -110,45 +79,50 @@ def extract_formats(self, info): details = info.get(song_format) if not details: continue - bitrate = int_or_none(details.get('bitrate')) or 999000 - data = self._call_player_api(song_id, bitrate) - for song in try_get(data, lambda x: x['data'], list) or []: - song_url = try_get(song, lambda x: x['url']) - if not song_url: - continue + for song in traverse_obj(self._call_player_api(song_id, bitrate), ('data', lambda _, v: url_or_none(v['url']))): + song_url = song['url'] if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, - 'ext': details.get('extension'), - 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': 
int_or_none(song.get('size')), - 'asr': int_or_none(details.get('sr')), + 'asr': traverse_obj(details, ('sr', {int_or_none})), + **traverse_obj(song, { + 'ext': ('type', {str}), + 'abr': ('br', {self.kilo_or_none}), + 'filesize': ('size', {int_or_none}), + }), }) elif err == 0: - err = try_get(song, lambda x: x['code'], int) + err = traverse_obj(song, ('code', {int})) or 0 if not formats: - msg = 'No media links found' if err != 0 and (err < 200 or err >= 400): - raise ExtractorError( - '%s (site code %d)' % (msg, err, ), expected=True) + raise ExtractorError(f'No media links found (site code {err})', expected=True) else: self.raise_geo_restricted( - msg + ': probably this video is not available from your location due to geo restriction.', - countries=['CN']) - + 'No media links found: probably due to geo restriction.', countries=['CN']) return formats - @classmethod - def convert_milliseconds(cls, ms): - return int(round(ms / 1000.0)) - def query_api(self, endpoint, video_id, note): - req = Request('%s%s' % (self._API_BASE, endpoint)) - req.headers['Referer'] = self._API_BASE - return self._download_json(req, video_id, note) + result = self._download_json( + f'{self._API_BASE}{endpoint}', video_id, note, headers={'Referer': self._API_BASE}) + code = traverse_obj(result, ('code', {int})) + message = traverse_obj(result, ('message', {str})) or '' + if code == -462: + self.raise_login_required(f'Login required to download: {message}') + elif code != 200: + raise ExtractorError(f'Failed to get meta info: {code} {message}') + return result + + def _get_entries(self, songs_data, entry_keys=None, id_key='id', name_key='name'): + for song in traverse_obj(songs_data, ( + *variadic(entry_keys, (str, bytes, dict, set)), + lambda _, v: int_or_none(v[id_key]) is not None)): + song_id = str(song[id_key]) + yield self.url_result( + f'http://music.163.com/#/song?id={song_id}', NetEaseMusicIE, + song_id, traverse_obj(song, (name_key, {str}))) class NetEaseMusicIE(NetEaseMusicBaseIE): @@ -156,16 +130,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): IE_DESC = '网易云音乐' _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P[0-9]+)' _TESTS = [{ - 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'url': 'https://music.163.com/#/song?id=548648087', 'info_dict': { - 'id': '32102397', + 'id': '548648087', 'ext': 'mp3', - 'title': 'Bad Blood', - 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150516', - 'timestamp': 1431792000, - 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', + 'title': '戒烟 (Live)', + 'creator': '李荣浩 / 朱正廷 / 陈立农 / 尤长靖 / ONER灵超 / ONER木子洋 / 杨非同 / 陆定昊', + 'timestamp': 1522944000, + 'upload_date': '20180405', + 'description': 'md5:3650af9ee22c87e8637cb2dde22a765c', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + "duration": 256, + 'thumbnail': r're:^http.*\.jpg', }, }, { 'note': 'No lyrics.', @@ -176,21 +152,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', - 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, - }, - }, { - 'note': 'Has translated name.', - 'url': 'http://music.163.com/#/song?id=22735043', - 'info_dict': { - 'id': '22735043', - 'ext': 'mp3', - 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', - 'upload_date': '20100127', - 'timestamp': 1264608000, - 'alt_title': '说出愿望吧(Genie)', + 'duration': 263, + 'thumbnail': r're:^http.*\.jpg', }, }, { 
'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', @@ -203,59 +167,99 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'upload_date': '19911130', 'timestamp': 691516800, 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 268, + 'alt_title': '伴唱:现代人乐队 合唱:总政歌舞团', + 'thumbnail': r're:^http.*\.jpg', }, + }, { + 'url': 'http://music.163.com/#/song?id=32102397', + 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'info_dict': { + 'id': '32102397', + 'ext': 'mp3', + 'title': 'Bad Blood', + 'creator': 'Taylor Swift / Kendrick Lamar', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:21535156efb73d6d1c355f95616e285a', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 199, + 'thumbnail': r're:^http.*\.jpg', + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Has translated name.', + 'url': 'http://music.163.com/#/song?id=22735043', + 'info_dict': { + 'id': '22735043', + 'ext': 'mp3', + 'title': '소원을 말해봐 (Genie)', + 'creator': '少女时代', + 'upload_date': '20100127', + 'timestamp': 1264608000, + 'description': 'md5:03d1ffebec3139aa4bafe302369269c5', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 229, + 'alt_title': '说出愿望吧(Genie)', + 'thumbnail': r're:^http.*\.jpg', + }, + 'skip': 'Blocked outside Mainland China', }] def _process_lyrics(self, lyrics_info): - original = lyrics_info.get('lrc', {}).get('lyric') - translated = lyrics_info.get('tlyric', {}).get('lyric') + original = traverse_obj(lyrics_info, ('lrc', 'lyric', {str})) + translated = traverse_obj(lyrics_info, ('tlyric', 'lyric', {str})) + + if not original or original == '[99:00.00]纯音乐,请欣赏\n': + return None if not translated: - return original + return { + 'lyrics': [{'data': original, 'ext': 'lrc'}], + } lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics + translation_ts_dict = dict(re.findall(lyrics_expr, translated)) + + merged = '\n'.join( + join_nonempty(f'{timestamp}{text}', translation_ts_dict.get(timestamp, ''), delim=' / ') + for timestamp, text in original_ts_texts) + + return { + 'lyrics_merged': [{'data': merged, 'ext': 'lrc'}], + 'lyrics': [{'data': original, 'ext': 'lrc'}], + 'lyrics_translated': [{'data': translated, 'ext': 'lrc'}], + } def _real_extract(self, url): song_id = self._match_id(url) - params = { - 'id': song_id, - 'ids': '[%s]' % song_id - } info = self.query_api( - 'song/detail?' 
+ compat_urllib_parse_urlencode(params),
-            song_id, 'Downloading song info')['songs'][0]
+            f'song/detail?id={song_id}&ids=%5B{song_id}%5D', song_id, 'Downloading song info')['songs'][0]
 
         formats = self.extract_formats(info)
 
-        lyrics_info = self.query_api(
-            'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
-            song_id, 'Downloading lyrics data')
-        lyrics = self._process_lyrics(lyrics_info)
-
-        alt_title = None
-        if info.get('transNames'):
-            alt_title = '/'.join(info.get('transNames'))
+        lyrics = self._process_lyrics(self.query_api(
+            f'song/lyric?id={song_id}&lv=-1&tv=-1', song_id, 'Downloading lyrics data'))
+        lyric_data = {
+            'description': traverse_obj(lyrics, (('lyrics_merged', 'lyrics'), 0, 'data'), get_all=False),
+            'subtitles': lyrics,
+        } if lyrics else {}
 
         return {
             'id': song_id,
-            'title': info['name'],
-            'alt_title': alt_title,
-            'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
-            'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
-            'thumbnail': info.get('album', {}).get('picUrl'),
-            'duration': self.convert_milliseconds(info.get('duration', 0)),
-            'description': lyrics,
             'formats': formats,
+            'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None,
+            'creator': ' / '.join(traverse_obj(info, ('artists', ..., 'name'))) or None,
+            **lyric_data,
+            **traverse_obj(info, {
+                'title': ('name', {str}),
+                'timestamp': ('album', 'publishTime', {self.kilo_or_none}),
+                'thumbnail': ('album', 'picUrl', {url_or_none}),
+                'duration': ('duration', {self.kilo_or_none}),
+            }),
         }
 
 
@@ -263,31 +267,44 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
     IE_NAME = 'netease:album'
     IE_DESC = '网易云音乐 - 专辑'
     _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
-    _TEST = {
+    _TESTS = [{
+        'url': 'https://music.163.com/#/album?id=133153666',
+        'info_dict': {
+            'id': '133153666',
+            'title': '桃几的翻唱',
+            'upload_date': '20210913',
+            'description': '桃几2021年翻唱合集',
+            'thumbnail': r're:^http.*\.jpg',
+        },
+        'playlist_mincount': 13,
+    }, {
         'url': 'http://music.163.com/#/album?id=220780',
         'info_dict': {
             'id': '220780',
-            'title': 'B\'day',
+            'title': 'B\'Day',
+            'upload_date': '20060904',
+            'description': 'md5:71a74e1d8f392d88cf1bbe48879ad0b0',
+            'thumbnail': r're:^http.*\.jpg',
         },
         'playlist_count': 23,
-        'skip': 'Blocked outside Mainland China',
-    }
+    }]
 
     def _real_extract(self, url):
         album_id = self._match_id(url)
+        webpage = self._download_webpage(f'https://music.163.com/album?id={album_id}', album_id)
 
-        info = self.query_api(
-            'album/%s?id=%s' % (album_id, album_id),
-            album_id, 'Downloading album data')['album']
-
-        name = info['name']
-        desc = info.get('description')
-        entries = [
-            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
-                            'NetEaseMusic', song['id'])
-            for song in info['songs']
-        ]
-        return self.playlist_result(entries, album_id, name, desc)
+        songs = self._search_json(
+            r'<div[^>]+\bid="song-list-pre-data"[^>]*>', webpage, 'metainfo', album_id,
+            end_pattern=r'</div>', contains_pattern=r'\[(?s:.+)\]')
+        metainfo = {
+            'title': self._og_search_property('title', webpage, 'title', fatal=False),
+            'description': self._html_search_regex(
+                (rf'<div[^>]+\bid="album-desc-{suffix}"[^>]*>(.*?)</div>' for suffix in ('more', 'dot')),
+                webpage, 'description', flags=re.S, fatal=False),
+            'thumbnail': self._og_search_property('image', webpage, 'thumbnail', fatal=False),
+            'upload_date': unified_strdate(self._html_search_meta('music:release_date', webpage, 'date', fatal=False)),
+        }
+        return self.playlist_result(self._get_entries(songs),
album_id, **metainfo) class NetEaseMusicSingerIE(NetEaseMusicBaseIE): @@ -299,10 +316,9 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'url': 'http://music.163.com/#/artist?id=10559', 'info_dict': { 'id': '10559', - 'title': '张惠妹 - aMEI;阿密特', + 'title': '张惠妹 - aMEI;阿妹;阿密特', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Singer has translated name.', 'url': 'http://music.163.com/#/artist?id=124098', @@ -311,28 +327,28 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'title': '李昇基 - 이승기', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Singer with both translated and alias', + 'url': 'https://music.163.com/#/artist?id=159692', + 'info_dict': { + 'id': '159692', + 'title': '初音ミク - 初音未来;Hatsune Miku', + }, + 'playlist_count': 50, }] def _real_extract(self, url): singer_id = self._match_id(url) info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), - singer_id, 'Downloading singer data') + f'artist/{singer_id}?id={singer_id}', singer_id, note='Downloading singer data') - name = info['artist']['name'] - if info['artist']['trans']: - name = '%s - %s' % (name, info['artist']['trans']) - if info['artist']['alias']: - name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) + name = join_nonempty( + traverse_obj(info, ('artist', 'name', {str})), + join_nonempty(*traverse_obj(info, ('artist', ('trans', ('alias', ...)), {str})), delim=';'), + delim=' - ') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['hotSongs'] - ] - return self.playlist_result(entries, singer_id, name) + return self.playlist_result(self._get_entries(info, 'hotSongs'), singer_id, name) class NetEaseMusicListIE(NetEaseMusicBaseIE): @@ -344,10 +360,28 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '79177352', 'title': 'Billboard 2007 Top 100', - 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022', + 'tags': ['欧美'], + 'uploader': '浑然破灭', + 'uploader_id': '67549805', + 'timestamp': int, + 'upload_date': r're:\d{8}', }, - 'playlist_count': 99, - 'skip': 'Blocked outside Mainland China', + 'playlist_mincount': 95, + }, { + 'note': 'Toplist/Charts sample', + 'url': 'https://music.163.com/#/discover/toplist?id=60198', + 'info_dict': { + 'id': '60198', + 'title': 're:美国Billboard榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': '美国Billboard排行榜', + 'tags': ['流行', '欧美', '榜单'], + 'uploader': 'Billboard公告牌', + 'uploader_id': '48171', + 'timestamp': int, + 'upload_date': r're:\d{8}', + }, + 'playlist_count': 100, }, { 'note': 'Toplist/Charts sample', 'url': 'http://music.163.com/#/discover/toplist?id=3733003', @@ -363,64 +397,86 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, - list_id, 'Downloading playlist data')['result'] + info = self._download_eapi_json( + '/v3/playlist/detail', list_id, + {'id': list_id, 't': '-1', 'n': '500', 's': '0'}, + note="Downloading playlist info") - name = info['name'] - desc = info.get('description') + metainfo = traverse_obj(info, ('playlist', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'tags': ('tags', ..., {str}), + 'uploader': ('creator', 'nickname', {str}), + 'uploader_id': ('creator', 'userId', {str_or_none}), + 'timestamp': ('updateTime', {self.kilo_or_none}), + })) + if traverse_obj(info, 
('playlist', 'specialType')) == 10: + metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' - if info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp( - self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') - name = '%s %s' % (name, datestamp) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['tracks'] - ] - return self.playlist_result(entries, list_id, name, desc) + return self.playlist_result(self._get_entries(info, ('playlist', 'tracks')), list_id, **metainfo) class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' IE_DESC = '网易云音乐 - MV' _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P[0-9]+)' - _TEST = { + _TESTS = [{ + 'url': 'https://music.163.com/#/mv?id=10958064', + 'info_dict': { + 'id': '10958064', + 'ext': 'mp4', + 'title': '交换余生', + 'description': 'md5:e845872cff28820642a2b02eda428fea', + 'creator': '林俊杰', + 'upload_date': '20200916', + 'thumbnail': r're:http.*\.jpg', + 'duration': 364, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { 'url': 'http://music.163.com/#/mv?id=415350', 'info_dict': { 'id': '415350', 'ext': 'mp4', 'title': '이럴거면 그러지말지', 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白雅言', + 'creator': '白娥娟', 'upload_date': '20150520', + 'thumbnail': r're:http.*\.jpg', + 'duration': 216, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): mv_id = self._match_id(url) info = self.query_api( - 'mv/detail?id=%s&type=mp4' % mv_id, - mv_id, 'Downloading mv info')['data'] + f'mv/detail?id={mv_id}&type=mp4', mv_id, 'Downloading mv info')['data'] formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} + {'url': mv_url, 'ext': 'mp4', 'format_id': f'{brs}p', 'height': int_or_none(brs)} for brs, mv_url in info['brs'].items() ] return { 'id': mv_id, - 'title': info['name'], - 'description': info.get('desc') or info.get('briefDesc'), - 'creator': info['artistName'], - 'upload_date': info['publishTime'].replace('-', ''), 'formats': formats, - 'thumbnail': info.get('cover'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), + **traverse_obj(info, { + 'title': ('name', {str}), + 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), + 'creator': ('artistName', {str}), + 'upload_date': ('publishTime', {unified_strdate}), + 'thumbnail': ('cover', {url_or_none}), + 'duration': ('duration', {self.kilo_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'like_count': ('likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), } @@ -431,75 +487,74 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', 'info_dict': { - 'id': '10109055', + 'id': '32593346', 'ext': 'mp3', 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', 'creator': '大话西藏', - 'timestamp': 1434179342, + 'timestamp': 1434179287, 'upload_date': '20150613', + 'thumbnail': r're:http.*\.jpg', 'duration': 900, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { 'id': '10141022', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'creator': '滚滚电台ORZ', + 
'timestamp': 1434450733, + 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', }, 'playlist_count': 4, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { - 'id': '10141022', + 'id': '32647209', 'ext': 'mp3', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450841, + 'creator': '滚滚电台ORZ', + 'timestamp': 1434450733, 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', + 'duration': 1104, }, 'params': { 'noplaylist': True }, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): program_id = self._match_id(url) info = self.query_api( - 'dj/program/detail?id=%s' % program_id, - program_id, 'Downloading program info')['program'] + f'dj/program/detail?id={program_id}', program_id, note='Downloading program info')['program'] - name = info['name'] - description = info['description'] + metainfo = traverse_obj(info, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'creator': ('dj', 'brand', {str}), + 'thumbnail': ('coverUrl', {url_or_none}), + 'timestamp': ('createTime', {self.kilo_or_none}), + }) if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): formats = self.extract_formats(info['mainSong']) return { - 'id': info['mainSong']['id'], - 'title': name, - 'description': description, - 'creator': info['dj']['brand'], - 'timestamp': self.convert_milliseconds(info['createTime']), - 'thumbnail': info['coverUrl'], - 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'id': str(info['mainSong']['id']), 'formats': formats, + 'duration': traverse_obj(info, ('mainSong', 'duration', {self.kilo_or_none})), + **metainfo, } - song_ids = [info['mainSong']['id']] - song_ids.extend([song['id'] for song in info['songs']]) - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song_id, - 'NetEaseMusic', song_id) - for song_id in song_ids - ] - return self.playlist_result(entries, program_id, name, description) + songs = traverse_obj(info, (('mainSong', ('songs', ...)),)) + return self.playlist_result(self._get_entries(songs), program_id, **metainfo) class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): @@ -511,38 +566,32 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '42', 'title': '声音蔓延', - 'description': 'md5:766220985cbd16fdd552f64c578a6b15' + 'description': 'md5:c7381ebd7989f9f367668a5aee7d5f08' }, 'playlist_mincount': 40, - 'skip': 'Blocked outside Mainland China', } _PAGE_SIZE = 1000 def _real_extract(self, url): dj_id = self._match_id(url) - name = None - desc = None + metainfo = {} entries = [] for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( - 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' - % (self._PAGE_SIZE, dj_id, offset), - dj_id, 'Downloading dj programs - %d' % offset) + f'dj/program/byradio?asc=false&limit={self._PAGE_SIZE}&radioId={dj_id}&offset={offset}', + dj_id, note=f'Downloading dj programs - {offset}') - entries.extend([ - self.url_result( - 'http://music.163.com/#/program?id=%s' % program['id'], - 'NetEaseMusicProgram', program['id']) - for program in info['programs'] - ]) - - if name is None: - radio = info['programs'][0]['radio'] - name = radio['name'] - desc = radio['desc'] + entries.extend(self.url_result( + f'http://music.163.com/#/program?id={program["id"]}', NetEaseMusicProgramIE, + program['id'], 
program.get('name')) for program in info['programs']) + if not metainfo: + metainfo = traverse_obj(info, ('programs', 0, 'radio', { + 'title': ('name', {str}), + 'description': ('desc', {str}), + })) if not info['more']: break - return self.playlist_result(entries, dj_id, name, desc) + return self.playlist_result(entries, dj_id, **metainfo) From a9efb4b8d74f3583450ffda0ee57259a47d39c70 Mon Sep 17 00:00:00 2001 From: xofe <22776566+xofe@users.noreply.github.com> Date: Fri, 6 Oct 2023 22:35:11 +0000 Subject: [PATCH 433/501] [ie/abc.net.au:iview] Improve `episode` extraction (#8201) Authored by: xofe --- yt_dlp/extractor/abc.py | 90 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index d2cf5f7c51..9d527246a1 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -181,18 +181,102 @@ class ABCIViewIE(InfoExtractor): _GEO_COUNTRIES = ['AU'] _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/utopia/series/1/video/CO1211V001S00', + 'md5': '52a942bfd7a0b79a6bfe9b4ce6c9d0ed', + 'info_dict': { + 'id': 'CO1211V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 Wood For The Trees', + 'series': 'Utopia', + 'description': 'md5:0cfb2c183c1b952d1548fd65c8a95c00', + 'upload_date': '20230726', + 'uploader_id': 'abc1', + 'series_id': 'CO1211V', + 'episode_id': 'CO1211V001S00', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'Wood For The Trees', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/co/CO1211V001S00_5ad8353f4df09_1280.jpg', + 'timestamp': 1690403700, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name', 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', 'md5': '67715ce3c78426b11ba167d875ac6abf', 'info_dict': { 'id': 'LE1927H001S00', 'ext': 'mp4', - 'title': "Series 11 Ep 1", - 'series': "Gruen", + 'title': 'Series 11 Ep 1', + 'series': 'Gruen', 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', 'upload_date': '20190925', 'uploader_id': 'abc1', + 'series_id': 'LE1927H', + 'episode_id': 'LE1927H001S00', + 'season_number': 11, + 'season': 'Season 11', + 'episode_number': 1, + 'episode': 'Episode 1', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/le/LE1927H001S00_5d954fbd79e25_1280.jpg', 'timestamp': 1569445289, }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode number', + 'url': 'https://iview.abc.net.au/show/four-corners/series/2022/video/NC2203H039S00', + 'md5': '77cb7d8434440e3b28fbebe331c2456a', + 'info_dict': { + 'id': 'NC2203H039S00', + 'ext': 'mp4', + 'title': 'Series 2022 Locking Up Kids', + 'series': 'Four Corners', + 'description': 'md5:54829ca108846d1a70e1fcce2853e720', + 'upload_date': '20221114', + 'uploader_id': 'abc1', + 'series_id': 'NC2203H', + 'episode_id': 'NC2203H039S00', + 'season_number': 2022, + 'season': 'Season 2022', + 'episode_number': None, + 'episode': 'Locking Up Kids', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/nc/NC2203H039S00_636d8a0944a22_1920.jpg', + 'timestamp': 1668460497, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name or number', + 'url': 'https://iview.abc.net.au/show/landline/series/2021/video/RF2004Q043S00', + 'md5': '2e17dec06b13cc81dc119d2565289396', + 'info_dict': { + 'id': 'RF2004Q043S00', + 'ext': 'mp4', 
+ 'title': 'Series 2021', + 'series': 'Landline', + 'description': 'md5:c9f30d9c0c914a7fd23842f6240be014', + 'upload_date': '20211205', + 'uploader_id': 'abc1', + 'series_id': 'RF2004Q', + 'episode_id': 'RF2004Q043S00', + 'season_number': 2021, + 'season': 'Season 2021', + 'episode_number': None, + 'episode': None, + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/rf/RF2004Q043S00_61a950639dbc0_1920.jpg', + 'timestamp': 1638710705, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], 'params': { 'skip_download': True, }, @@ -254,6 +338,8 @@ def tokenize_url(url, token): 'episode_number': int_or_none(self._search_regex( r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), 'episode_id': house_number, + 'episode': self._search_regex( + r'^(?:Series\s+\d+)?\s*(?:Ep\s+\d+)?\s*(.*)$', title, 'episode', default='') or None, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, From 48cceec1ddb8649b5e771df8df79eb9c39c82b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:38:26 -0300 Subject: [PATCH 434/501] [ie/lbry] Add playlist support (#8213) Closes #5982, Closes #8204 Authored by: drzraf, bashonly, Grub4K --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/lbry.py | 184 ++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 908abb8ace..ef6123e8a7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -951,6 +951,7 @@ from .lbry import ( LBRYIE, LBRYChannelIE, + LBRYPlaylistIE, ) from .lci import LCIIE from .lcp import ( diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 9a9f9256fe..ccce300b5b 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -22,10 +22,11 @@ class LBRYBaseIE(InfoExtractor): - _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' + _BASE_URL_REGEX = r'(?x)(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX + _OPT_CLAIM_ID = '[^$@:/?#&]+(?:[:#]%s)?' 
% _CLAIM_ID_REGEX _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + _PAGE_SIZE = 50 def _call_api_proxy(self, method, display_id, params, resource): headers = {'Content-Type': 'application/json-rpc'} @@ -77,10 +78,70 @@ def _parse_stream(self, stream, url): return info + def _fetch_page(self, display_id, url, params, page): + page += 1 + page_params = { + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + **params, + } + result = self._call_api_proxy( + 'claim_search', display_id, page_params, f'page {page}') + for item in traverse_obj(result, ('items', lambda _, v: v['name'] and v['claim_id'])): + yield { + **self._parse_stream(item, url), + '_type': 'url', + 'id': item['claim_id'], + 'url': self._permanent_url(url, item['name'], item['claim_id']), + } + + def _playlist_entries(self, url, display_id, claim_param, metadata): + qs = parse_qs(url) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'claim_type': 'stream', + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + **claim_param, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, display_id, url, params), + self._PAGE_SIZE) + + return self.playlist_result( + entries, display_id, **traverse_obj(metadata, ('value', { + 'title': 'title', + 'description': 'description', + }))) + class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' + (?:\$/(?:download|embed)/)? 
+ (?P + [^$@:/?#]+/{LBRYBaseIE._CLAIM_ID_REGEX} + |(?:@{LBRYBaseIE._OPT_CLAIM_ID}/)?{LBRYBaseIE._OPT_CLAIM_ID} + )''' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -149,7 +210,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', + 'formats': 'mincount:3', # FIXME 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -184,12 +245,12 @@ class LBRYIE(LBRYBaseIE): 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', 'ext': 'mp4', 'title': 'Biotechnological Invasion of Skin (April 2023)', - 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'description': 'md5:fe28689db2cb7ba3436d819ac3ffc378', 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', - 'timestamp': 1685790036, - 'upload_date': '20230603', + 'timestamp': 1695114347, + 'upload_date': '20230919', 'release_timestamp': 1685617473, 'release_date': '20230601', 'duration': 1063, @@ -229,10 +290,10 @@ class LBRYIE(LBRYBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - if display_id.startswith('$/'): - display_id = display_id.split('/', 2)[-1].replace('/', ':') - else: + if display_id.startswith('@'): display_id = display_id.replace(':', '#') + else: + display_id = display_id.replace('/', ':') display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') @@ -299,7 +360,7 @@ def _real_extract(self, url): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', 'info_dict': { @@ -315,65 +376,50 @@ class LBRYChannelIE(LBRYBaseIE): 'url': 'lbry://@lbry#3f', 'only_matching': True, }] - _PAGE_SIZE = 50 - - def _fetch_page(self, claim_id, url, params, page): - page += 1 - page_params = { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - } - page_params.update(params) - result = self._call_api_proxy( - 'claim_search', claim_id, page_params, 'page %d' % page) - for item in (result.get('items') or []): - stream_claim_name = item.get('name') - stream_claim_id = item.get('claim_id') - if not (stream_claim_name and stream_claim_id): - continue - - yield { - **self._parse_stream(item, url), - '_type': 'url', - 'id': stream_claim_id, - 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') - result = self._resolve_url( - 'lbry://' + display_id, display_id, 'channel') + result = self._resolve_url(f'lbry://{display_id}', display_id, 'channel') claim_id = result['claim_id'] - qs = parse_qs(url) - content = qs.get('content', [None])[0] - params = { - 'fee_amount': qs.get('fee_amount', ['>=0'])[0], - 'order_by': { - 'new': ['release_time'], - 'top': ['effective_amount'], - 'trending': ['trending_group', 'trending_mixed'], - }[qs.get('order', ['new'])[0]], - 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, - } - duration 
= qs.get('duration', [None])[0]
-        if duration:
-            params['duration'] = {
-                'long': '>=1200',
-                'short': '<=240',
-            }[duration]
-        language = qs.get('language', ['all'])[0]
-        if language != 'all':
-            languages = [language]
-            if language == 'en':
-                languages.append('none')
-            params['any_languages'] = languages
-        entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, claim_id, url, params),
-            self._PAGE_SIZE)
-        result_value = result.get('value') or {}
-        return self.playlist_result(
-            entries, claim_id, result_value.get('title'),
-            result_value.get('description'))
+
+        return self._playlist_entries(url, claim_id, {'channel_ids': [claim_id]}, result)
+
+
+class LBRYPlaylistIE(LBRYBaseIE):
+    IE_NAME = 'lbry:playlist'
+    _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)'
+    _TESTS = [{
+        'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2',
+        'info_dict': {
+            'id': 'ffef782f27486f0ac138bde8777f72ebdd0548c2',
+            'title': 'Théâtre Classique',
+            'description': 'Théâtre Classique',
+        },
+        'playlist_mincount': 4,
+    }, {
+        'url': 'https://odysee.com/$/list/9c6658b3dd21e4f2a0602d523a13150e2b48b770',
+        'info_dict': {
+            'id': '9c6658b3dd21e4f2a0602d523a13150e2b48b770',
+            'title': 'Social Media Exposed',
+            'description': 'md5:98af97317aacd5b85d595775ea37d80e',
+        },
+        'playlist_mincount': 34,
+    }, {
+        'url': 'https://odysee.com/$/playlist/938fb11d-215f-4d1c-ad64-723954df2184',
+        'info_dict': {
+            'id': '938fb11d-215f-4d1c-ad64-723954df2184',
+        },
+        'playlist_mincount': 1000,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        result = traverse_obj(self._call_api_proxy('claim_search', display_id, {
+            'claim_ids': [display_id],
+            'no_totals': True,
+            'page': 1,
+            'page_size': self._PAGE_SIZE,
+        }, 'playlist'), ('items', 0))
+        claim_param = {'claim_ids': traverse_obj(result, ('value', 'claims', ..., {str}))}
+
+        return self._playlist_entries(url, display_id, claim_param, result)

From fbcc299bd8a19cf8b3c8805d6c268a9110230973 Mon Sep 17 00:00:00 2001
From: Umar Getagazov
Date: Sat, 7 Oct 2023 01:45:46 +0300
Subject: [PATCH 435/501] [ie/substack] Fix embed extraction (#8218)

Authored by: handlerug
---
 yt_dlp/extractor/substack.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py
index 3782ceed1c..5835a5a8d3 100644
--- a/yt_dlp/extractor/substack.py
+++ b/yt_dlp/extractor/substack.py
@@ -50,7 +50,7 @@ def _extract_embed_urls(cls, url, webpage):
         if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage):
             return
 
-        mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage)
+        mobj = re.search(r'{[^}]*\\?["\']subdomain\\?["\']\s*:\s*\\?["\'](?P<subdomain>[^\\"\']+)', webpage)
         if mobj:
             parsed = urllib.parse.urlparse(url)
             yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()
         raise
cls.StopExtraction() - def _extract_video_formats(self, video_id, username): + def _extract_video_formats(self, video_id, url): formats, subtitles = [], {} for video_format in ('hls', 'mp4'): - video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + video_url = urllib.parse.urljoin(url, f'/api/v1/video/upload/{video_id}/src?type={video_format}') if video_format == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) @@ -81,12 +81,17 @@ def _real_extract(self, url): r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) + canonical_url = url + domain = traverse_obj(webpage_info, ('domainInfo', 'customDomain', {str})) + if domain: + canonical_url = urllib.parse.urlparse(url)._replace(netloc=domain).geturl() + post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} elif post_type == 'video': - formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: self.raise_no_formats(f'Page type "{post_type}" is not supported') @@ -99,4 +104,5 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), 'uploader': traverse_obj(webpage_info, ('pub', 'name')), 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + 'webpage_url': canonical_url, } From 2ad3873f0dfa9285c91d2160e36c039e69d597c7 Mon Sep 17 00:00:00 2001 From: garret Date: Fri, 6 Oct 2023 23:53:11 +0100 Subject: [PATCH 437/501] [ie/radiko] Improve extraction (#8221) Authored by: garret1317 --- yt_dlp/extractor/radiko.py | 67 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index cef68eba08..8c8fb1a8f9 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -1,4 +1,5 @@ import base64 +import random import urllib.parse from .common import InfoExtractor @@ -13,6 +14,7 @@ class RadikoBaseIE(InfoExtractor): + _GEO_BYPASS = False _FULL_KEY = None _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ( 'https://c-rpaa.smartstream.ne.jp', @@ -32,7 +34,7 @@ class RadikoBaseIE(InfoExtractor): 'https://c-radiko.smartstream.ne.jp', ) - def _auth_client(self): + def _negotiate_token(self): _, auth1_handle = self._download_webpage_handle( 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', headers={ @@ -58,10 +60,23 @@ def _auth_client(self): 'x-radiko-partialkey': partial_key, }).split(',')[0] + if area_id == 'OUT': + self.raise_geo_restricted(countries=['JP']) + auth_data = (auth_token, area_id) self.cache.store('radiko', 'auth_data', auth_data) return auth_data + def _auth_client(self): + cachedata = self.cache.load('radiko', 'auth_data') + if cachedata is not None: + response = self._download_webpage( + 'https://radiko.jp/v2/api/auth_check', None, 'Checking cached token', expected_status=401, + headers={'X-Radiko-AuthToken': cachedata[0], 'X-Radiko-AreaId': cachedata[1]}) + if response == 'OK': + return cachedata + return self._negotiate_token() + def _extract_full_key(self): if self._FULL_KEY: return self._FULL_KEY @@ -75,7 +90,7 @@ def _extract_full_key(self): if full_key: full_key = 
full_key.encode() - else: # use full key ever known + else: # use only full key ever known full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa' self._FULL_KEY = full_key @@ -103,24 +118,24 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, m3u8_playlist_data = self._download_xml( f'https://radiko.jp/v3/station/stream/pc_html5/{station}.xml', video_id, note='Downloading stream information') - m3u8_urls = m3u8_playlist_data.findall('.//url') formats = [] found = set() - for url_tag in m3u8_urls: - pcu = url_tag.find('playlist_create_url').text - url_attrib = url_tag.attrib + + timefree_int = 0 if is_onair else 1 + + for element in m3u8_playlist_data.findall(f'.//url[@timefree="{timefree_int}"]/playlist_create_url'): + pcu = element.text + if pcu in found: + continue + found.add(pcu) playlist_url = update_url_query(pcu, { 'station_id': station, **query, 'l': '15', - 'lsid': '88ecea37e968c1f17d5413312d9f8003', + 'lsid': ''.join(random.choices('0123456789abcdef', k=32)), 'type': 'b', }) - if playlist_url in found: - continue - else: - found.add(playlist_url) time_to_skip = None if is_onair else cursor - ft @@ -138,7 +153,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)): sf['preference'] = -100 sf['format_note'] = 'not preferred' - if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: + if not is_onair and timefree_int == 1 and time_to_skip: sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} formats.extend(subformats) @@ -166,21 +181,7 @@ def _real_extract(self, url): vid_int = unified_timestamp(video_id, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) - auth_cache = self.cache.load('radiko', 'auth_data') - for attempt in range(2): - auth_token, area_id = (not attempt and auth_cache) or self._auth_client() - formats = self._extract_formats( - video_id=video_id, station=station, is_onair=False, - ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, - query={ - 'start_at': radio_begin, - 'ft': radio_begin, - 'end_at': radio_end, - 'to': radio_end, - 'seek': video_id, - }) - if formats: - break + auth_token, area_id = self._auth_client() return { 'id': video_id, @@ -189,8 +190,18 @@ def _real_extract(self, url): 'uploader': try_call(lambda: station_program.find('.//name').text), 'uploader_id': station, 'timestamp': vid_int, - 'formats': formats, 'is_live': True, + 'formats': self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id + } + ), } From 35d9cbaf9638ccc9daf8a863063b2e7c135bc664 Mon Sep 17 00:00:00 2001 From: AS6939 <46506352+AS6939@users.noreply.github.com> Date: Sat, 7 Oct 2023 06:56:12 +0800 Subject: [PATCH 438/501] [ie/iq.com] Fix extraction and subtitles (#8260) Closes #7734, Closes #8123 Authored by: AS6939 --- yt_dlp/extractor/iqiyi.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index fa602ba887..3368ab1d93 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -499,9 +499,10 @@ class IqIE(InfoExtractor): 'tm': tm, 'qdy': 'a', 'qds': 0, - 'k_ft1': 141287244169348, - 'k_ft4': 34359746564, - 'k_ft5': 1, + 'k_ft1': '143486267424900', 
+ 'k_ft4': '1572868', + 'k_ft7': '4', + 'k_ft5': '1', 'bop': JSON.stringify({ 'version': '10.0', 'dfp': dfp @@ -529,14 +530,22 @@ def _extract_vms_player_js(self, webpage, video_id): webpack_js_url = self._proto_relative_url(self._search_regex( r'') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + data.get('file_url') or data['stream_url'], video_id, 'm4a', m3u8_id='hls'), + 'age_limit': 18, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'release_timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {str_or_none}), + 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('plays', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'webpage_url': ('permalink_url', {url_or_none}), + }), + } From 0e722f2f3ca42e634fd7b06ee70b16bf833ce132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:59:42 -0300 Subject: [PATCH 440/501] [ie/lbry] Extract `uploader_id` (#8244) Closes #123 Authored by: drzraf --- yt_dlp/extractor/lbry.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index ccce300b5b..cc37c41e8c 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -70,11 +70,11 @@ def _parse_stream(self, stream, url): 'duration': ('value', stream_type, 'duration', {int_or_none}), 'channel': ('signing_channel', 'value', 'title', {str}), 'channel_id': ('signing_channel', 'claim_id', {str}), + 'uploader_id': ('signing_channel', 'name', {str}), }) - channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) - if channel_name and info.get('channel_id'): - info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) + if info.get('uploader_id') and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, info['uploader_id'], info['channel_id']) return info @@ -159,6 +159,7 @@ class LBRYIE(LBRYBaseIE): 'height': 720, 'thumbnail': 'https://spee.ch/7/67f2d809c263288c.png', 'license': 'None', + 'uploader_id': '@Mantega', 'duration': 346, 'channel': 'LBRY/Odysee rats united!!!', 'channel_id': '1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', @@ -192,6 +193,7 @@ class LBRYIE(LBRYBaseIE): 'vcodec': 'none', 'thumbnail': 'https://spee.ch/d/0bc63b0e6bf1492d.png', 'license': 'None', + 'uploader_id': '@LBRYFoundation', } }, { 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', @@ -210,7 +212,8 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', # FIXME + 'uploader_id': '@gardeningincanada', + 'formats': 'mincount:3', 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -235,6 +238,7 @@ class LBRYIE(LBRYBaseIE): 'formats': 'mincount:1', 'thumbnail': 'startswith:https://thumb', 'license': 'None', + 'uploader_id': '@RT', }, 'params': {'skip_download': True} }, { @@ -249,6 +253,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 
'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'uploader_id': '@wickedtruths', 'timestamp': 1695114347, 'upload_date': '20230919', 'release_timestamp': 1685617473, From e831c80e8b2fc025b3b67d82974cc59e3526fdc8 Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 7 Oct 2023 00:05:48 +0100 Subject: [PATCH 441/501] [ie/nhk] Fix VOD extraction (#8249) Closes #8242 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index fbd6a18f6d..bcbc2279f6 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -28,6 +28,44 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip): m_id, lang, '/all' if is_video else ''), m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + def _get_api_info(self, refresh=True): + if not refresh: + return self.cache.load('nhk', 'api_info') + + self.cache.store('nhk', 'api_info', {}) + movie_player_js = self._download_webpage( + 'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None, + note='Downloading stream API information') + api_info = { + 'url': self._search_regex( + r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'), + 'token': self._search_regex( + r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'), + } + self.cache.store('nhk', 'api_info', api_info) + return api_info + + def _extract_formats_and_subtitles(self, vod_id): + for refresh in (False, True): + api_info = self._get_api_info(refresh) + if not api_info: + continue + + api_url = api_info.pop('url') + stream_url = traverse_obj( + self._download_json( + api_url, vod_id, 'Downloading stream url info', fatal=False, query={ + **api_info, + 'type': 'json', + 'optional_id': vod_id, + 'active_flg': 1, + }), + ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + if stream_url: + return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + + raise ExtractorError('Unable to extract stream url') + def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() @@ -67,12 +105,14 @@ def get_clean_field(key): } if is_video: vod_id = episode['vod_id'] + formats, subs = self._extract_formats_and_subtitles(vod_id) + info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id, 'id': vod_id, + 'formats': formats, + 'subtitles': subs, }) + else: if fetch_episode: audio_path = episode['audio']['audio'] From 19c90e405b4137c06dfe6f9aaa02396df0da93e5 Mon Sep 17 00:00:00 2001 From: trainman261 Date: Sat, 7 Oct 2023 01:56:19 +0200 Subject: [PATCH 442/501] [cleanup] Update extractor tests (#7718) Authored by: trainman261 --- yt_dlp/extractor/aenetworks.py | 1 + yt_dlp/extractor/amcnetworks.py | 1 + yt_dlp/extractor/cbc.py | 7 ++++++- yt_dlp/extractor/cbs.py | 2 ++ yt_dlp/extractor/cnbc.py | 2 ++ yt_dlp/extractor/corus.py | 3 ++- yt_dlp/extractor/generic.py | 13 ++++++++++--- yt_dlp/extractor/mediaset.py | 3 ++- yt_dlp/extractor/movieclips.py | 1 + yt_dlp/extractor/nationalgeographic.py | 3 +++ yt_dlp/extractor/nbc.py | 22 +++++++++++++++++----- yt_dlp/extractor/scrippsnetworks.py | 4 ++++ yt_dlp/extractor/syfy.py | 1 + yt_dlp/extractor/theplatform.py | 6 +++--- yt_dlp/extractor/theweatherchannel.py | 20 
+++++++++++--------- 15 files changed, 66 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index f049a0fb3c..cc26653c1d 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -338,6 +338,7 @@ class BiographyIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': '404 Not Found', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py index c58bc7bfbf..10bd021c55 100644 --- a/yt_dlp/extractor/amcnetworks.py +++ b/yt_dlp/extractor/amcnetworks.py @@ -26,6 +26,7 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE # m3u8 download 'skip_download': True, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 2920b9027d..be2d13e442 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -66,6 +66,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', 'timestamp': 255977160, }, + 'skip': '404 Not Found', }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', @@ -97,7 +98,7 @@ class CBCIE(InfoExtractor): # multiple CBC.APP.Caffeine.initInstance(...) 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', 'info_dict': { - 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', # FIXME 'id': 'dog-indoor-exercise-winter-1.3928238', 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, @@ -476,6 +477,10 @@ class CBCGemPlaylistIE(InfoExtractor): 'id': 'schitts-creek/s06', 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + 'series': 'Schitt\'s Creek', + 'season_number': 6, + 'season': 'Season 6', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, }, { 'url': 'https://gem.cbc.ca/schitts-creek/s06', diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 1c0dbdea94..d97fbd758c 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -101,6 +101,7 @@ class CBSIE(CBSBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'Subscription required', }, { 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { @@ -117,6 +118,7 @@ class CBSIE(CBSBaseIE): }, 'expected_warnings': [ 'This content expired on', 'No video formats found', 'Requested format is not available'], + 'skip': '404 Not Found', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 'only_matching': True, diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py index 68fd025b7c..7d209b6d90 100644 --- a/yt_dlp/extractor/cnbc.py +++ b/yt_dlp/extractor/cnbc.py @@ -19,6 +19,7 @@ class CNBCIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): @@ -49,6 +50,7 @@ class CNBCVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index c03d65310d..bcc34ddd8a 100644 --- 
a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -41,7 +41,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE ) ''' _TESTS = [{ - 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', + 'url': 'https://www.hgtv.ca/video/bryan-inc/movie-night-popcorn-with-bryan/870923331648/', 'info_dict': { 'id': '870923331648', 'ext': 'mp4', @@ -54,6 +54,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], + # FIXME: yt-dlp wrongly raises for geo restriction }, { 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', 'only_matching': True, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 33e71d1c57..5e1240c13a 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -58,6 +58,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', + 'direct': True, + 'timestamp': 1273772943.0, } }, # Direct link to media delivered compressed (until Accept-Encoding is *) @@ -101,6 +103,8 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' @@ -133,6 +137,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -145,12 +150,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, @@ -267,6 +272,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -278,6 +284,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -318,7 +325,7 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index e3b728dcae..2d62042982 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -127,7 +127,8 @@ class MediasetIE(ThePlatformBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Dead link', }, { # WittyTV embed 'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/', diff --git a/yt_dlp/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py index 4777f440e0..f7f2921fdb 100644 --- a/yt_dlp/extractor/movieclips.py +++ b/yt_dlp/extractor/movieclips.py @@ -23,6 +23,7 @@ class MovieClipsIE(InfoExtractor): 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], + 'skip': 'redirects to YouTube', } def _real_extract(self, url): 
diff --git a/yt_dlp/extractor/nationalgeographic.py b/yt_dlp/extractor/nationalgeographic.py index ad525c2589..6f046bc29c 100644 --- a/yt_dlp/extractor/nationalgeographic.py +++ b/yt_dlp/extractor/nationalgeographic.py @@ -24,6 +24,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', @@ -38,6 +39,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, ] @@ -75,6 +77,7 @@ class NationalGeographicTVIE(FOXIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'Content not available', }] _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index b3c28ab55d..666550a491 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -284,7 +284,7 @@ class NBCSportsIE(InfoExtractor): _TESTS = [{ # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation', 'info_dict': { 'id': 'PHJSaFWbrTY9', 'ext': 'mp4', @@ -379,7 +379,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate 'info_dict': { 'id': '269389891880', 'ext': 'mp4', @@ -387,6 +387,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 'timestamp': 1401363060, 'upload_date': '20140529', + 'duration': 46.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg', }, }, { @@ -402,7 +404,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'md5': '40d0e48c68896359c80372306ece0fc3', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', @@ -410,11 +412,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, 'upload_date': '20150205', + 'duration': 1236.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939', 'info_dict': { 'id': 'n431456', 'ext': 'mp4', @@ -422,11 +426,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, + 'duration': 37.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg', }, }, { 'url': 
'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'md5': '693d1fa21d23afcc9b04c66b227ed9ff', 'info_dict': { 'id': '669831235788', 'ext': 'mp4', @@ -434,6 +440,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'duration': 69.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg', }, }, { @@ -447,6 +455,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'duration': 940.0, }, }, { @@ -535,6 +544,7 @@ class NBCOlympicsIE(InfoExtractor): 'upload_date': '20160815', 'uploader': 'NBCU-SPORTS', }, + 'skip': '404 Not Found', } def _real_extract(self, url): @@ -578,6 +588,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, { 'note': 'Plain m3u8 source URL', 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', @@ -589,6 +600,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, ] diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py index adfd7e5f29..7f0bc96456 100644 --- a/yt_dlp/extractor/scrippsnetworks.py +++ b/yt_dlp/extractor/scrippsnetworks.py @@ -39,6 +39,7 @@ class ScrippsNetworksWatchIE(AWSIE): 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], + 'skip': '404 Not Found', }] _SNI_TABLE = { @@ -113,6 +114,9 @@ class ScrippsNetworksIE(InfoExtractor): 'timestamp': 1475678834, 'upload_date': '20161005', 'uploader': 'SCNI-SCND', + 'duration': 29.995, + 'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': ''}], + 'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg', }, 'add_ie': ['ThePlatform'], 'expected_warnings': ['No HLS formats found'], diff --git a/yt_dlp/extractor/syfy.py b/yt_dlp/extractor/syfy.py index c79d27a0de..afcdbf7804 100644 --- a/yt_dlp/extractor/syfy.py +++ b/yt_dlp/extractor/syfy.py @@ -23,6 +23,7 @@ class SyfyIE(AdobePassIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 99caeb5f99..433ce8427c 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -167,7 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): # rtmp download 'skip_download': True, }, - 'skip': '404 Not Found', + 'skip': 'CNet no longer uses ThePlatform', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -177,7 +177,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', }, - 'skip': '404 Not Found', + 'skip': 'Dead link', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -195,7 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, - 'skip': '404 Not Found', + 'skip': 'Error: Player PID "nbcNewsOffsite" is disabled', }, { # From 
http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 diff --git a/yt_dlp/extractor/theweatherchannel.py b/yt_dlp/extractor/theweatherchannel.py index 682e4335d2..d1921e4f9a 100644 --- a/yt_dlp/extractor/theweatherchannel.py +++ b/yt_dlp/extractor/theweatherchannel.py @@ -11,17 +11,19 @@ class TheWeatherChannelIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?weather\.com(?P(?:/(?P[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P[^/?#]+))' _TESTS = [{ - 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', + 'url': 'https://weather.com/storms/hurricane/video/invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'md5': '68f0cf616435683f27ce36bd9c927394', 'info_dict': { - 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'id': '81acef2d-ee8c-4545-ba83-bff3cc80db97', 'ext': 'mp4', - 'title': 'Ice Climber Is In For A Shock', - 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', - 'uploader': 'TWC - Digital (No Distro)', - 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', - 'upload_date': '20160720', - 'timestamp': 1469018835, + 'title': 'Invest 95L In Atlantic Has A Medium Chance Of Development', + 'description': 'md5:0de720fd5f0d0e32207bd4c270fff824', + 'uploader': 'TWC - Digital', + 'uploader_id': 'b5a999e0-9e04-11e1-9ee2-001d092f5a10', + 'upload_date': '20230721', + 'timestamp': 1689967343, + 'display_id': 'invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'duration': 34.0, } }, { 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', From 792f1e64f6a2beac51e85408d142b3118115c4fd Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Sat, 7 Oct 2023 05:56:47 +0600 Subject: [PATCH 443/501] [ie/theta] Remove extractors (#8251) Authored by: alerikaisattera --- yt_dlp/extractor/_extractors.py | 4 -- yt_dlp/extractor/theta.py | 90 --------------------------------- 2 files changed, 94 deletions(-) delete mode 100644 yt_dlp/extractor/theta.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b10ef2f332..55c3c2f8e8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2004,10 +2004,6 @@ ) from .thestar import TheStarIE from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE diff --git a/yt_dlp/extractor/theta.py b/yt_dlp/extractor/theta.py deleted file mode 100644 index ecf0ea091d..0000000000 --- a/yt_dlp/extractor/theta.py +++ /dev/null @@ -1,90 +0,0 @@ -from .common import InfoExtractor -from ..utils import try_get - - -class ThetaStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P[a-z0-9-]+)' - _TESTS = [{ - 'url': 'https://www.theta.tv/davirus', - 'skip': 'The live may have ended', - 'info_dict': { - 'id': 'DaVirus', - 'ext': 'mp4', - 'title': 'I choose you - My Community is King -👀 - YO HABLO ESPANOL - CODE DAVIRUS', - 'thumbnail': r're:https://live-thumbnails-prod-theta-tv\.imgix\.net/thumbnail/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/mst3k', - 'note': 'This channel is live 24/7', - 'info_dict': { - 'id': 'MST3K', - 'ext': 'mp4', - 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA 
Network.', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/contv-anime', - 'info_dict': { - 'id': 'ConTVAnime', - 'ext': 'mp4', - 'title': 'CONTV ANIME 24/7. Powered by THETA Network.', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/channel?alias={channel_id}', channel_id)['body'] - - m3u8_playlist = next( - data['url'] for data in info['live_stream']['video_urls'] - if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source')) - - formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) - - channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization - - return { - 'id': channel, - 'title': try_get(info, lambda x: x['live_stream']['title']), - 'channel': channel, - 'view_count': try_get(info, lambda x: x['live_stream']['view_count']), - 'is_live': True, - 'formats': formats, - 'thumbnail': try_get(info, lambda x: x['live_stream']['thumbnail_url']), - } - - -class ThetaVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/video/(?Pvid[a-z0-9]+)' - _TEST = { - 'url': 'https://www.theta.tv/video/vidiq6aaet3kzf799p0', - 'md5': '633d8c29eb276bb38a111dbd591c677f', - 'info_dict': { - 'id': 'vidiq6aaet3kzf799p0', - 'ext': 'mp4', - 'title': 'Theta EdgeCast Tutorial', - 'uploader': 'Pixiekittie', - 'description': 'md5:e316253f5bdced8b5a46bb50ae60a09f', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+/vod_thumb/.+.jpg', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/video/{video_id}/raw', video_id)['body'] - - m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url']) - - formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': info.get('title'), - 'uploader': try_get(info, lambda x: x['user']['username']), - 'description': info.get('description'), - 'view_count': info.get('view_count'), - 'like_count': info.get('like_count'), - 'formats': formats, - 'thumbnail': info.get('thumbnail_url'), - } From 03e85ea99db76a2fddb65bf46f8819bda780aaf3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Oct 2023 20:00:15 -0500 Subject: [PATCH 444/501] [ie/youtube] Fix `heatmap` extraction (#8299) Closes #8189 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7e13aa7797..b7ac3e9cc1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3292,16 +3292,15 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) - def _extract_heatmap_from_player_overlay(self, data): - content_list = traverse_obj(data, ( - 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', - 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) - return next(filter(None, ( - traverse_obj(contents, (..., 'heatMarkerRenderer', { - 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, 
scale=1000)}), - 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, - 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), - })) for contents in content_list)), None) + def _extract_heatmap(self, data): + return traverse_obj(data, ( + 'frameworkUpdates', 'entityBatchUpdate', 'mutations', + lambda _, v: v['payload']['macroMarkersListEntity']['markersList']['markerType'] == 'MARKER_TYPE_HEATMAP', + 'payload', 'macroMarkersListEntity', 'markersList', 'markers', ..., { + 'start_time': ('startMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (int(x['startMillis']) + int(x['durationMillis'])) / 1000}, + 'value': ('intensityScoreNormalized', {float_or_none}), + })) or None def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -4435,7 +4434,7 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) - info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + info['heatmap'] = self._extract_heatmap(initial_data) contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), From 377e85a1797db9e98b78b38203ed9d4ded229991 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 7 Oct 2023 03:02:45 +0200 Subject: [PATCH 445/501] [cleanup] Misc (#8300) * Simplify nuxt regex * Fix tmz quotes and tests * Update test python versions Authored by: dirkf, gamer191, Grub4K --- .github/workflows/core.yml | 4 +- .github/workflows/download.yml | 2 +- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/tmz.py | 266 +++++++++++++++++---------------- 4 files changed, 138 insertions(+), 136 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 689408c500..7fcf11dfa2 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -13,7 +13,7 @@ jobs: matrix: os: [ubuntu-latest] # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10] + python-version: ['3.8', '3.9', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows @@ -21,7 +21,7 @@ jobs: python-version: '3.7' run-tests-ext: bat - os: windows-latest - python-version: '3.12-dev' + python-version: '3.12' run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 2b2387d4f1..c3478721c3 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -28,7 +28,7 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.10', 3.11-dev, pypy-3.7, pypy-3.8] + python-version: ['3.7', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index c94b4abdc2..c3ceb00391 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1687,7 +1687,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) - FUNCTION_RE = r'\(function\((?P.*?)\){(?:.*?)return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' + FUNCTION_RE = r'\(function\((?P.*?)\){.*?\breturn\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' js, arg_keys, arg_vals = self._search_regex( (rf'', rf'{rectx}\(.*?{FUNCTION_RE}'), webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), diff --git a/yt_dlp/extractor/tmz.py b/yt_dlp/extractor/tmz.py index ffb30c6b87..edd16bc5b2 100644 --- a/yt_dlp/extractor/tmz.py +++ b/yt_dlp/extractor/tmz.py @@ -8,158 +8,160 @@ class TMZIE(InfoExtractor): - _VALID_URL = r"https?://(?:www\.)?tmz\.com/.*" + _VALID_URL = r'https?://(?:www\.)?tmz\.com/.*' _TESTS = [ { - "url": "http://www.tmz.com/videos/0-cegprt2p/", - "info_dict": { - "id": "http://www.tmz.com/videos/0-cegprt2p/", - "ext": "mp4", - "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", - "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.", - "timestamp": 1467831837, - "uploader": "TMZ Staff", - "upload_date": "20160706", - "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg", - "duration": 772.0, + 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'info_dict': { + 'id': 'http://www.tmz.com/videos/0-cegprt2p/', + 'ext': 'mp4', + 'title': 'No Charges Against Hillary Clinton? Harvey Says It Ain\'t Over Yet', + 'description': 'Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.', + 'timestamp': 1467831837, + 'uploader': 'TMZ Staff', + 'upload_date': '20160706', + 'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg', + 'duration': 772.0, }, }, { - "url": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", - "info_dict": { - "id": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/", - "ext": "mp4", - "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women", - "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it's women's fault in the first place.", - "timestamp": 1562889485, - "uploader": "TMZ Staff", - "upload_date": "20190711", - "thumbnail": "https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg", - "duration": 123.0, + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'info_dict': { + 'id': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'ext': 'mp4', + 'title': 'Angry Bagel Shop Guy Says He Doesn\'t Trust Women', + 'description': 'The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it\'s women\'s fault in the first place.', + 'timestamp': 1562889485, + 'uploader': 'TMZ Staff', + 'upload_date': '20190711', + 'thumbnail': 'https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg', + 'duration': 123.0, }, }, { - "url": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", - "md5": "5429c85db8bde39a473a56ca8c4c5602", - "info_dict": { - "id": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert", - "ext": "mp4", - "title": "Bobby Brown Tells Crowd ... 
Bobbi Kristina is Awake", - "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - "timestamp": 1429467813, - "uploader": "TMZ Staff", - "upload_date": "20150419", - "duration": 29.0, - "thumbnail": "https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg", + 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'md5': '5429c85db8bde39a473a56ca8c4c5602', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'ext': 'mp4', + 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', + 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + 'timestamp': 1429467813, + 'uploader': 'TMZ Staff', + 'upload_date': '20150419', + 'duration': 29.0, + 'thumbnail': 'https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg', }, }, { - "url": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "info_dict": { - "id": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "ext": "mp4", - "title": "Patti LaBelle -- Goes Nuclear On Stripping Fan", - "description": "Patti LaBelle made it known loud and clear last night ... NO " - "ONE gets on her stage and strips down.", - "timestamp": 1442683746, - "uploader": "TMZ Staff", - "upload_date": "20150919", - "duration": 104.0, - "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg", + 'url': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'ext': 'mp4', + 'title': 'Patti LaBelle -- Goes Nuclear On Stripping Fan', + 'description': 'Patti LaBelle made it known loud and clear last night ... NO ' + 'ONE gets on her stage and strips down.', + 'timestamp': 1442683746, + 'uploader': 'TMZ Staff', + 'upload_date': '20150919', + 'duration': 104.0, + 'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "info_dict": { - "id": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "ext": "mp4", - "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This", - "description": "Two pretty parts of this video with NBA Commish Adam Silver.", - "timestamp": 1454010989, - "uploader": "TMZ Staff", - "upload_date": "20160128", - "duration": 59.0, - "thumbnail": "https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg", + 'url': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'ext': 'mp4', + 'title': 'NBA\'s Adam Silver -- Blake Griffin\'s a Great Guy ... 
He\'ll Learn from This', + 'description': 'Two pretty parts of this video with NBA Commish Adam Silver.', + 'timestamp': 1454010989, + 'uploader': 'TMZ Staff', + 'upload_date': '20160128', + 'duration': 59.0, + 'thumbnail': 'https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "info_dict": { - "id": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "ext": "mp4", - "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!", - "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. he's ready and willing to go to jail for the crime.", - "timestamp": 1477500095, - "uploader": "TMZ Staff", - "upload_date": "20161026", - "thumbnail": "https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg", - "duration": 128.0, + 'url': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'ext': 'mp4', + 'title': 'Trump Star Vandal -- I\'m Not Afraid of Donald or the Cops!', + 'description': 'James Otis is the the guy who took a pickaxe to Donald Trump\'s star on the Walk of Fame, and he tells TMZ .. he\'s ready and willing to go to jail for the crime.', + 'timestamp': 1477500095, + 'uploader': 'TMZ Staff', + 'upload_date': '20161026', + 'thumbnail': 'https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg', + 'duration': 128.0, }, }, { - "url": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "info_dict": { - "id": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "ext": "mp4", - "title": "Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist " - "Demonstrators", - "description": "Beverly Hills may be an omen of what's coming next week, " - "because things got crazy on the streets and cops started " - "swinging their billy clubs at both Anti-Fascist and Pro-Trump " - "demonstrators.", - "timestamp": 1604182772, - "uploader": "TMZ Staff", - "upload_date": "20201031", - "duration": 96.0, - "thumbnail": "https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg", + 'url': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'info_dict': { + 'id': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'ext': 'mp4', + 'title': 'Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist ' + 'Demonstrators', + 'description': 'Beverly Hills may be an omen of what\'s coming next week, ' + 'because things got crazy on the streets and cops started ' + 'swinging their billy clubs at both Anti-Fascist and Pro-Trump ' + 'demonstrators.', + 'timestamp': 1604182772, + 'uploader': 'TMZ Staff', + 'upload_date': '20201031', + 'duration': 96.0, + 'thumbnail': 'https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg', }, }, { - "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/", - "info_dict": { - "id": "Dddb6IGe-ws", - "ext": "mp4", - "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing", - "uploader": "ESNEWS", - "description": "md5:49675bc58883ccf80474b8aa701e1064", - "upload_date": "20201102", - "uploader_id": "ESNEWS", - "uploader_url": 
"http://www.youtube.com/user/ESNEWS", - "like_count": int, - "channel_id": "UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel": "ESNEWS", - "view_count": int, - "duration": 225, - "live_status": "not_live", - "thumbnail": "https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp", - "channel_url": "https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel_follower_count": int, - "playable_in_embed": True, - "categories": ["Sports"], - "age_limit": 0, - "tags": "count:10", - "availability": "public", + 'url': 'https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/', + 'info_dict': { + 'id': 'Dddb6IGe-ws', + 'ext': 'mp4', + 'title': 'SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing', + 'uploader': 'ESNEWS', + 'description': 'md5:49675bc58883ccf80474b8aa701e1064', + 'upload_date': '20201102', + 'uploader_id': '@ESNEWS', + 'uploader_url': 'https://www.youtube.com/@ESNEWS', + 'like_count': int, + 'channel_id': 'UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel': 'ESNEWS', + 'view_count': int, + 'duration': 225, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp', + 'channel_url': 'https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'categories': ['Sports'], + 'age_limit': 0, + 'tags': 'count:10', + 'availability': 'public', + 'comment_count': int, }, }, { - "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/", - "info_dict": { - "id": "1329450007125225473", - "ext": "mp4", - "title": "The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.", - "uploader": "The Mac Life", - "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69", - "upload_date": "20201119", - "uploader_id": "TheMacLife", - "timestamp": 1605800556, - "thumbnail": "https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small", - "like_count": int, - "duration": 11.812, - "uploader_url": "https://twitter.com/TheMacLife", - "age_limit": 0, - "repost_count": int, - "tags": [], - "comment_count": int, + 'url': 'https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/', + 'info_dict': { + 'id': '1329448013937471491', + 'ext': 'mp4', + 'title': 'The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.', + 'uploader': 'The Mac Life', + 'description': 'md5:56e6009bbc3d12498e10d08a8e1f1c69', + 'upload_date': '20201119', + 'display_id': '1329450007125225473', + 'uploader_id': 'TheMacLife', + 'timestamp': 1605800556, + 'thumbnail': 'https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small', + 'like_count': int, + 'duration': 11.812, + 'uploader_url': 'https://twitter.com/TheMacLife', + 'age_limit': 0, + 'repost_count': int, + 'tags': [], + 'comment_count': int, }, }, ] @@ -167,25 +169,25 @@ class TMZIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url) jsonld = self._search_json_ld(webpage, url) - if not jsonld or "url" not in jsonld: + if not jsonld or 'url' not in jsonld: # try to extract from YouTube Player API # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions match_obj = re.search(r'\.cueVideoById\(\s*(?P[\'"])(?P.*?)(?P=quote)', webpage) if match_obj: - res = self.url_result(match_obj.group("id")) + res = 
self.url_result(match_obj.group('id')) return res # try to extract from twitter - blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage) + blockquote_el = get_element_by_attribute('class', 'twitter-tweet', webpage) if blockquote_el: matches = re.findall( r']+href=\s*(?P[\'"])(?P.*?)(?P=quote)', blockquote_el) if matches: for _, match in matches: - if "/status/" in match: + if '/status/' in match: res = self.url_result(match) return res - raise ExtractorError("No video found!") + raise ExtractorError('No video found!') if id not in jsonld: - jsonld["id"] = url + jsonld['id'] = url return jsonld From 4392c4680c383b221b6aa26d25c6e4b5581a5ad6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 7 Oct 2023 01:28:34 +0000 Subject: [PATCH 446/501] Release 2023.10.07 Created by: Grub4K :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++--- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++--- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++--- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++--- CONTRIBUTORS | 6 ++++ Changelog.md | 29 +++++++++++++++++++ supportedsites.md | 4 +-- yt_dlp/version.py | 4 +-- 10 files changed, 63 insertions(+), 28 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index f0fc71d575..dacb41758d 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ac9a72a1c1..ec6e298a19 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update 
instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 577e4d4910..cf3cdd21f3 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 9529c1bd6c..1bbcf68956 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index b17a6e046c..d3bc06e809 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 5345e8917c..30311d5b56 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 72b9584ecf..8eda413072 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -503,3 +503,9 @@ Yalab7 zhallgato zhong-yiyu Zprokkel +AS6939 +drzraf +handlerug +jiru +madewokherd +xofe diff --git a/Changelog.md b/Changelog.md index 04511927fa..48dcbf1029 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,35 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.10.07 + +#### Extractor changes +- **abc.net.au**: iview: [Improve `episode` extraction](https://github.com/yt-dlp/yt-dlp/commit/a9efb4b8d74f3583450ffda0ee57259a47d39c70) ([#8201](https://github.com/yt-dlp/yt-dlp/issues/8201)) by [xofe](https://github.com/xofe) +- **erocast**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/47c598783c98c179e04dd12c2a3fee0f3dc53087) ([#8264](https://github.com/yt-dlp/yt-dlp/issues/8264)) by [madewokherd](https://github.com/madewokherd) +- **gofile**: [Fix token cookie bug](https://github.com/yt-dlp/yt-dlp/commit/0730d5a966fa8a937d84bfb7f68be5198acb039b) by [bashonly](https://github.com/bashonly) +- **iq.com**: [Fix extraction and subtitles](https://github.com/yt-dlp/yt-dlp/commit/35d9cbaf9638ccc9daf8a863063b2e7c135bc664) ([#8260](https://github.com/yt-dlp/yt-dlp/issues/8260)) by [AS6939](https://github.com/AS6939) +- **lbry** + - [Add playlist support](https://github.com/yt-dlp/yt-dlp/commit/48cceec1ddb8649b5e771df8df79eb9c39c82b90) ([#8213](https://github.com/yt-dlp/yt-dlp/issues/8213)) by [bashonly](https://github.com/bashonly), [drzraf](https://github.com/drzraf), [Grub4K](https://github.com/Grub4K) + - [Extract `uploader_id`](https://github.com/yt-dlp/yt-dlp/commit/0e722f2f3ca42e634fd7b06ee70b16bf833ce132) ([#8244](https://github.com/yt-dlp/yt-dlp/issues/8244)) by [drzraf](https://github.com/drzraf) +- **litv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/91a670a4f7babe9c8aa2018f57d8c8952a6f49d8) ([#7785](https://github.com/yt-dlp/yt-dlp/issues/7785)) by [jiru](https://github.com/jiru) +- **neteasemusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/f980df734cf5c0eaded2f7b38c6c60bccfeebb48) ([#8181](https://github.com/yt-dlp/yt-dlp/issues/8181)) by [c-basalt](https://github.com/c-basalt) +- **nhk**: [Fix VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/e831c80e8b2fc025b3b67d82974cc59e3526fdc8) ([#8249](https://github.com/yt-dlp/yt-dlp/issues/8249)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Improve 
extraction](https://github.com/yt-dlp/yt-dlp/commit/2ad3873f0dfa9285c91d2160e36c039e69d597c7) ([#8221](https://github.com/yt-dlp/yt-dlp/issues/8221)) by [garret1317](https://github.com/garret1317) +- **substack** + - [Fix download cookies bug](https://github.com/yt-dlp/yt-dlp/commit/2f2dda3a7e85148773da3cdbc03ac9949ec1bc45) ([#8219](https://github.com/yt-dlp/yt-dlp/issues/8219)) by [handlerug](https://github.com/handlerug) + - [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/fbcc299bd8a19cf8b3c8805d6c268a9110230973) ([#8218](https://github.com/yt-dlp/yt-dlp/issues/8218)) by [handlerug](https://github.com/handlerug) +- **theta**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/792f1e64f6a2beac51e85408d142b3118115c4fd) ([#8251](https://github.com/yt-dlp/yt-dlp/issues/8251)) by [alerikaisattera](https://github.com/alerikaisattera) +- **wrestleuniversevod**: [Call API with device ID](https://github.com/yt-dlp/yt-dlp/commit/b095fd3fa9d58a65dc9b830bd63b9d909422aa86) ([#8272](https://github.com/yt-dlp/yt-dlp/issues/8272)) by [bashonly](https://github.com/bashonly) +- **xhamster**: user: [Support creator urls](https://github.com/yt-dlp/yt-dlp/commit/cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3) ([#8232](https://github.com/yt-dlp/yt-dlp/issues/8232)) by [Grub4K](https://github.com/Grub4K) +- **youtube** + - [Fix `heatmap` extraction](https://github.com/yt-dlp/yt-dlp/commit/03e85ea99db76a2fddb65bf46f8819bda780aaf3) ([#8299](https://github.com/yt-dlp/yt-dlp/issues/8299)) by [bashonly](https://github.com/bashonly) + - [Raise a warning for `Incomplete Data` instead of an error](https://github.com/yt-dlp/yt-dlp/commit/eb5bdbfa70126c7d5355cc0954b63720522e462c) ([#8238](https://github.com/yt-dlp/yt-dlp/issues/8238)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **cleanup** + - [Update extractor tests](https://github.com/yt-dlp/yt-dlp/commit/19c90e405b4137c06dfe6f9aaa02396df0da93e5) ([#7718](https://github.com/yt-dlp/yt-dlp/issues/7718)) by [trainman261](https://github.com/trainman261) + - Miscellaneous: [377e85a](https://github.com/yt-dlp/yt-dlp/commit/377e85a1797db9e98b78b38203ed9d4ded229991) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K) + ### 2023.09.24 #### Important changes diff --git a/supportedsites.md b/supportedsites.md index 620e0f3058..ecef4dc2d1 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -422,6 +422,7 @@ # Supported sites - **eplus:inbound**: e+ (イープラス) overseas - **Epoch** - **Eporner** + - **Erocast** - **EroProfile**: [*eroprofile*](## "netrc machine") - **EroProfile:album** - **ertflix**: ERTFLIX videos @@ -699,6 +700,7 @@ # Supported sites - **LastFMUser** - **lbry** - **lbry:channel** + - **lbry:playlist** - **LCI** - **Lcp** - **LcpPlay** @@ -1474,8 +1476,6 @@ # Supported sites - **ThePlatformFeed** - **TheStar** - **TheSun** - - **ThetaStream** - - **ThetaVideo** - **TheWeatherChannel** - **ThisAmericanLife** - **ThisAV** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 2a7c84b93f..60c1c94cc3 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.09.24' +__version__ = '2023.10.07' -RELEASE_GIT_HEAD = '088add9567d39b758737e4299a0e619fd89d2e8f' +RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991' VARIANT = None From 9d7ded6419089c1bf252496073f73ad90ed71004 Mon Sep 17 00:00:00 2001 From: Awal Garg Date: Sun, 8 Oct 2023 01:57:23 +0200 Subject: [PATCH 447/501] [utils] `js_to_json`: Fix `Date` constructor parsing (#8295) Authored by: awalgarg, Grub4K --- test/test_utils.py | 7 ++++++- yt_dlp/utils/_utils.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index fd612ff86f..77040f29c6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1209,6 +1209,9 @@ def test_js_to_json_edgecases(self): on = js_to_json('\'"\\""\'') self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped') + on = js_to_json('[new Date("spam"), \'("eggs")\']') + self.assertEqual(json.loads(on), ['spam', '("eggs")'], msg='Date regex should match a single string') + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') @@ -1220,11 +1223,13 @@ def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') self.assertEqual(js_to_json('`${name}`', {}), '"name"') - def test_js_to_json_map_array_constructors(self): + def test_js_to_json_common_constructors(self): self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5}) self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10]) self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5]) self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5}) + self.assertEqual(json.loads(js_to_json('new Date("123")')), "123") + self.assertEqual(json.loads(js_to_json('new Date(\'2023-10-19\')')), "2023-10-19") def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index ba62423806..3dc17bf593 100644 --- a/yt_dlp/utils/_utils.py +++ 
b/yt_dlp/utils/_utils.py @@ -2744,7 +2744,7 @@ def create_map(mobj): code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) if not strict: - code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code) code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code) code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code) From 1c51c520f7b511ebd9e4eb7322285a8c31eedbbd Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 8 Oct 2023 02:01:01 +0200 Subject: [PATCH 448/501] [fd/fragment] Improve progress calculation (#8241) This uses the download speed from all threads and also adds smoothing to speed and eta Authored by: Grub4K --- yt_dlp/downloader/fragment.py | 48 ++++++--------- yt_dlp/utils/progress.py | 109 ++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 29 deletions(-) create mode 100644 yt_dlp/utils/progress.py diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index b4b680dae1..b4f003d37f 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -14,6 +14,7 @@ from ..networking.exceptions import HTTPError, IncompleteRead from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj from ..utils.networking import HTTPHeaderDict +from ..utils.progress import ProgressCalculator class HttpQuietDownloader(HttpFD): @@ -226,8 +227,7 @@ def _start_frag_download(self, ctx, info_dict): resume_len = ctx['complete_frags_downloaded_bytes'] total_frags = ctx['total_frags'] ctx_id = ctx.get('ctx_id') - # This dict stores the download progress, it's updated by the progress - # hook + # Stores the download progress, updated by the progress hook state = { 'status': 'downloading', 'downloaded_bytes': resume_len, @@ -237,14 +237,8 @@ def _start_frag_download(self, ctx, info_dict): 'tmpfilename': ctx['tmpfilename'], } - start = time.time() - ctx.update({ - 'started': start, - 'fragment_started': start, - # Amount of fragment's bytes downloaded by the time of the previous - # frag progress hook invocation - 'prev_frag_downloaded_bytes': 0, - }) + ctx['started'] = time.time() + progress = ProgressCalculator(resume_len) def frag_progress_hook(s): if s['status'] not in ('downloading', 'finished'): @@ -259,38 +253,35 @@ def frag_progress_hook(s): state['max_progress'] = ctx.get('max_progress') state['progress_idx'] = ctx.get('progress_idx') - time_now = time.time() - state['elapsed'] = time_now - start + state['elapsed'] = progress.elapsed frag_total_bytes = s.get('total_bytes') or 0 s['fragment_info_dict'] = s.pop('info_dict', {}) + + # XXX: Fragment resume is not accounted for here if not ctx['live']: estimated_size = ( (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / (state['fragment_index'] + 1) * total_frags) - state['total_bytes_estimate'] = estimated_size + progress.total = estimated_size + progress.update(s.get('downloaded_bytes')) + state['total_bytes_estimate'] = progress.total + else: + progress.update(s.get('downloaded_bytes')) if s['status'] == 'finished': state['fragment_index'] += 1 ctx['fragment_index'] = state['fragment_index'] - state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] - ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, 
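# --- editorial sketch (not part of any patch above) ----------------------
# Why the js_to_json patch above swaps the greedy '".+"' for a single
# string-literal pattern: '.+' can span from the first quote to the last
# one in the input, swallowing a neighbouring literal. STRING_RE below is a
# simplified stand-in for yt-dlp's real one (double-quoted strings only).
import re

STRING_RE = r'"(?:\\.|[^"\\])*"'
code = '[new Date("spam"), \'("eggs")\']'

old = re.search(r'new Date\((".+")\)', code)
print(old.group(1))    # "spam"), '("eggs"   <- spans into the next literal

new = re.search(rf'new Date\(({STRING_RE})\)', code)
print(new.group(1))    # "spam"
# --------------------------------------------------------------------------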
frag_total_bytes) - ctx['fragment_started'] = time.time() - ctx['prev_frag_downloaded_bytes'] = 0 - else: - frag_downloaded_bytes = s['downloaded_bytes'] - state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['speed'] = state['speed'] = self.calc_speed( - ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0)) - if not ctx['live']: - state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) - ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes + progress.thread_reset() + + state['downloaded_bytes'] = ctx['complete_frags_downloaded_bytes'] = progress.downloaded + state['speed'] = ctx['speed'] = progress.speed.smooth + state['eta'] = progress.eta.smooth + self._hook_progress(state, info_dict) ctx['dl'].add_progress_hook(frag_progress_hook) - return start + return ctx['started'] def _finish_frag_download(self, ctx, info_dict): ctx['dest_stream'].close() @@ -500,7 +491,6 @@ def _download_fragment(fragment): download_fragment(fragment, ctx_copy) return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') - self.report_warning('The download speed shown is only of one thread. This is a known issue') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: try: for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): diff --git a/yt_dlp/utils/progress.py b/yt_dlp/utils/progress.py new file mode 100644 index 0000000000..f254a3887e --- /dev/null +++ b/yt_dlp/utils/progress.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import bisect +import threading +import time + + +class ProgressCalculator: + # Time to calculate the speed over (seconds) + SAMPLING_WINDOW = 3 + # Minimum timeframe before to sample next downloaded bytes (seconds) + SAMPLING_RATE = 0.05 + # Time before showing eta (seconds) + GRACE_PERIOD = 1 + + def __init__(self, initial: int): + self._initial = initial or 0 + self.downloaded = self._initial + + self.elapsed: float = 0 + self.speed = SmoothValue(0, smoothing=0.7) + self.eta = SmoothValue(None, smoothing=0.9) + + self._total = 0 + self._start_time = time.monotonic() + self._last_update = self._start_time + + self._lock = threading.Lock() + self._thread_sizes: dict[int, int] = {} + + self._times = [self._start_time] + self._downloaded = [self.downloaded] + + @property + def total(self): + return self._total + + @total.setter + def total(self, value: int | None): + with self._lock: + if value is not None and value < self.downloaded: + value = self.downloaded + + self._total = value + + def thread_reset(self): + current_thread = threading.get_ident() + with self._lock: + self._thread_sizes[current_thread] = 0 + + def update(self, size: int | None): + if not size: + return + + current_thread = threading.get_ident() + + with self._lock: + last_size = self._thread_sizes.get(current_thread, 0) + self._thread_sizes[current_thread] = size + self._update(size - last_size) + + def _update(self, size: int): + current_time = time.monotonic() + + self.downloaded += size + self.elapsed = current_time - self._start_time + if self.total is not None and self.downloaded > self.total: + self._total = self.downloaded + + if self._last_update + self.SAMPLING_RATE > current_time: + return + self._last_update = current_time + + self._times.append(current_time) + self._downloaded.append(self.downloaded) + + offset = bisect.bisect_left(self._times, current_time - self.SAMPLING_WINDOW) + del self._times[:offset] + del 
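# --- editorial sketch (not part of the patch) -----------------------------
# The per-thread bookkeeping in ProgressCalculator.update above, reduced to
# its core: each worker reports the *cumulative* bytes of its current
# fragment, so only the delta since that thread's last report is added.
# This is what lets a single progress line account for every thread at once.
import threading

class Tally:
    def __init__(self):
        self.total = 0
        self._last = {}                  # thread ident -> last reported size
        self._lock = threading.Lock()

    def update(self, size):
        ident = threading.get_ident()
        with self._lock:
            self.total += size - self._last.get(ident, 0)
            self._last[ident] = size     # remember this thread's last report

t = Tally()
t.update(100); t.update(250)             # same thread: +100, then +150
print(t.total)                           # 250
# --------------------------------------------------------------------------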
self._downloaded[:offset] + if len(self._times) < 2: + self.speed.reset() + self.eta.reset() + return + + download_time = current_time - self._times[0] + if not download_time: + return + + self.speed.set((self.downloaded - self._downloaded[0]) / download_time) + if self.total and self.speed.value and self.elapsed > self.GRACE_PERIOD: + self.eta.set((self.total - self.downloaded) / self.speed.value) + else: + self.eta.reset() + + +class SmoothValue: + def __init__(self, initial: float | None, smoothing: float): + self.value = self.smooth = self._initial = initial + self._smoothing = smoothing + + def set(self, value: float): + self.value = value + if self.smooth is None: + self.smooth = self.value + else: + self.smooth = (1 - self._smoothing) * value + self._smoothing * self.smooth + + def reset(self): + self.value = self.smooth = self._initial From b7098d46b552a9322c6cea39ba80be5229f922de Mon Sep 17 00:00:00 2001 From: naginatana <96737708+naginatana@users.noreply.github.com> Date: Tue, 10 Oct 2023 01:46:16 +0800 Subject: [PATCH 449/501] [ie/youku] Improve tudou.com support (#8160) Authored by: naginatana --- yt_dlp/extractor/youku.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index 7ecd9f1839..e351765868 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -20,7 +20,7 @@ class YoukuIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?://( - (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + (?:v|play(?:er)?)\.(?:youku|tudou)\.com/(?:v_show/id_|player\.php/sid/)| video\.tudou\.com/v/)| youku:) (?P[A-Za-z0-9]+)(?:\.html|/v\.swf|) @@ -87,6 +87,19 @@ class YoukuIE(InfoExtractor): 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==', 'tags': list, }, + }, { + 'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng==.html?', + 'info_dict': { + 'id': 'XNjAxNjI2OTU3Ng', + 'ext': 'mp4', + 'title': '阿斯塔意识到哈里杀了人,自己被骗了', + 'thumbnail': 'https://m.ykimg.com/0541010164F732752794D4D7B70331D1', + 'uploader_id': '88758207', + 'tags': [], + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UMzU1MDMyODI4', + 'uploader': '英美剧场', + 'duration': 72.91, + }, }] @staticmethod From 09f815ad52843219a7ee3f2a0dddf6c250c91f0c Mon Sep 17 00:00:00 2001 From: Stefan Lobbenmeier Date: Mon, 9 Oct 2023 19:51:37 +0200 Subject: [PATCH 450/501] [ie/ArteTV] Support age-restricted content (#8301) Closes #7782 Authored by: StefanLobbenmeier --- yt_dlp/extractor/arte.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index a19cd2a3ae..139a3a729f 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -48,17 +48,7 @@ class ArteTVIE(ArteTVBaseIE): }, { 'note': 'No alt_title', 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', - 'info_dict': { - 'id': '110371-000-A', - 'ext': 'mp4', - 'upload_date': '20220718', - 'duration': 154, - 'timestamp': 1658162460, - 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', - 'title': 'La chaleur, supplice des arbres de rue', - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', - }, - 'params': {'skip_download': 'm3u8'} + 'only_matching': True, }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, @@ -67,19 +57,20 @@ class ArteTVIE(ArteTVBaseIE): 'only_matching': True, }, { 'url': 
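# --- editorial sketch (not part of any patch above) -----------------------
# The exponential moving average SmoothValue applies, restated standalone:
# with smoothing=0.7 each new sample contributes 30% and the running value
# keeps 70%, which damps jitter in the displayed speed and ETA.
def smooth_series(samples, smoothing=0.7):
    smooth = None
    out = []
    for value in samples:
        smooth = value if smooth is None else (1 - smoothing) * value + smoothing * smooth
        out.append(round(smooth, 1))
    return out

print(smooth_series([100, 400, 120, 110]))   # [100, 190.0, 169.0, 151.3]
# --------------------------------------------------------------------------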
'https://www.arte.tv/de/videos/110203-006-A/zaz/', + 'only_matching': True, + }, { + 'note': 'age-restricted', + 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/', 'info_dict': { - 'id': '110203-006-A', - 'chapters': 'count:16', - 'description': 'md5:cf592f1df52fe52007e3f8eac813c084', - 'alt_title': 'Zaz', - 'title': 'Baloise Session 2022', - 'timestamp': 1668445200, - 'duration': 4054, - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530', - 'upload_date': '20221114', + 'id': '006785-000-A', + 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba', + 'title': 'The Element of Crime', + 'timestamp': 1696111200, + 'duration': 5849, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530', + 'upload_date': '20230930', 'ext': 'mp4', - }, - 'expected_warnings': ['geo restricted'] + } }] _GEO_BYPASS = True @@ -136,7 +127,9 @@ def _real_extract(self, url): lang = mobj.group('lang') or mobj.group('lang_2') langauge_code = self._LANG_MAP.get(lang) - config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={ + 'x-validated-age': '18' + }) geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} if geoblocking.get('restrictedArea'): From 88a99c87b680ae59002534a517e191f46c42cbd4 Mon Sep 17 00:00:00 2001 From: Midnight Veil Date: Tue, 10 Oct 2023 04:55:46 +1100 Subject: [PATCH 451/501] [ie/tenplay] Add support for seasons (#7939) Closes #7744 Authored by: midnightveil --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/tenplay.py | 58 +++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 55c3c2f8e8..6717a6039f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1992,7 +1992,10 @@ WeTvSeriesIE, ) from .tennistv import TennisTVIE -from .tenplay import TenPlayIE +from .tenplay import ( + TenPlayIE, + TenPlaySeasonIE, +) from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index c7097cf025..7ce7cbf849 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,9 +1,11 @@ -from datetime import datetime import base64 +import functools +import itertools +from datetime import datetime from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, urlencode_postdata +from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin class TenPlayIE(InfoExtractor): @@ -113,3 +115,55 @@ def _real_extract(self, url): 'uploader': 'Channel 10', 'uploader_id': '2199827728001', } + + +class TenPlaySeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?P[^/?#]+)/episodes/(?P[^/?#]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://10play.com.au/masterchef/episodes/season-14', + 'info_dict': { + 'title': 'Season 14', + 'id': 'MjMyOTIy', + }, + 'playlist_mincount': 64, + }, { + 'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2022', + 'info_dict': { + 'title': 'Season 2022', + 'id': 'Mjc0OTIw', + }, + 'playlist_mincount': 256, + }] + + def _entries(self, load_more_url, display_id=None): + skip_ids = [] + for page in itertools.count(1): + episodes_carousel = self._download_json( + load_more_url, display_id, 
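# --- editorial sketch (not part of the patch) -----------------------------
# The gist of the arte.py change above: the player-config endpoint honours
# an age-validation header, so sending it up front unlocks age-restricted
# entries without cookies. Plain-stdlib sketch; yt-dlp itself routes the
# request through its own networking layer, not urllib directly.
import json
import urllib.request

def fetch_arte_config(lang, video_id):
    req = urllib.request.Request(
        f'https://api.arte.tv/api/player/v2/config/{lang}/{video_id}',
        headers={'x-validated-age': '18'})   # the header added by the patch
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# fetch_arte_config('de', '006785-000-A')   # would perform a real request
# --------------------------------------------------------------------------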
query={'skipIds[]': skip_ids}, + note=f'Fetching episodes page {page}') + + episodes_chunk = episodes_carousel['items'] + skip_ids.extend(ep['id'] for ep in episodes_chunk) + + for ep in episodes_chunk: + yield ep['cardLink'] + if not episodes_carousel['hasMore']: + break + + def _real_extract(self, url): + show, season = self._match_valid_url(url).group('show', 'season') + season_info = self._download_json( + f'https://10play.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}') + + episodes_carousel = traverse_obj(season_info, ( + 'content', 0, 'components', ( + lambda _, v: v['title'].lower() == 'episodes', + (..., {dict}), + )), get_all=False) or {} + + playlist_id = episodes_carousel['tpId'] + + return self.playlist_from_matches( + self._entries(urljoin(url, episodes_carousel['loadMoreUrl']), playlist_id), + playlist_id, traverse_obj(season_info, ('content', 0, 'title', {str})), + getter=functools.partial(urljoin, url)) From 4de94b9e165bfd6421a692f5f2eabcdb08edcb71 Mon Sep 17 00:00:00 2001 From: garret Date: Mon, 9 Oct 2023 19:00:26 +0100 Subject: [PATCH 452/501] [ie/nhk] Fix Japanese-language VOD extraction (#8309) Closes #8303 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 68 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index bcbc2279f6..f6b5c501bb 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -68,11 +68,12 @@ def _extract_formats_and_subtitles(self, vod_id): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None - lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if len(episode_id) == 7: + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') + is_video = m_type == 'video' + + if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] @@ -133,47 +134,46 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = r'%s%s(?P[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?Pvideo)/(?P[0-9a-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}/(?Paudio)/(?P[^/?#]+?-\d{{8}}-[0-9a-z]+)'] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
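# --- editorial sketch (not part of the patch) -----------------------------
# The pagination pattern behind TenPlaySeasonIE._entries above: there is no
# page cursor; the endpoint is simply re-queried with the IDs already seen
# ('skipIds') until it reports hasMore == False. fetch_page is a stand-in
# for the JSON request.
import itertools

def paginate(fetch_page):
    seen_ids = []
    for page in itertools.count(1):          # page number is only for logging
        carousel = fetch_page(skip_ids=seen_ids)
        for item in carousel['items']:
            seen_ids.append(item['id'])
            yield item['cardLink']
        if not carousel['hasMore']:
            break

pages = [{'items': [{'id': 1, 'cardLink': '/ep1'}], 'hasMore': True},
         {'items': [{'id': 2, 'cardLink': '/ep2'}], 'hasMore': False}]
print(list(paginate(lambda skip_ids: pages[len(skip_ids)])))   # ['/ep1', '/ep2']
# --------------------------------------------------------------------------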
_TESTS = [{ - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/', 'info_dict': { - 'id': 'yd8322ch', + 'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302', 'ext': 'mp4', - 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', - 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', - 'upload_date': '20230514', - 'timestamp': 1684083791, - 'series': 'GRAND SUMO Highlights', - 'episode': '[Recap] May Tournament Day 1 (Opening Day)', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', + 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', + 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', + 'series': 'Japan Railway Journal', }, }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'md5': '153c3016dfd252ba09726588149cf0e7', 'info_dict': { - 'id': 'a95j5iza', + 'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5', 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + # radio + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/', 'info_dict': { - 'id': 'r_inventions-20201104-1-en', + 'id': 'livinginjapan-20231001-1-en', 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', + 'series': 'Living in Japan', + 'description': 'md5:850611969932874b4a3309e0cae06c2f', + 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, - 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -199,6 +199,19 @@ class NhkVodIE(NhkBaseIE): 'timestamp': 1623722008, }, 'skip': '404 Not Found', + }, { + # japanese-language, longer id than english + 'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/', + 'info_dict': { + 'id': 'nw_ja_v_jvod_ohayou_20231008', + 'ext': 'mp4', + 'title': 'おはよう日本(7時台) - 10月8日放送', + 'series': 'おはよう日本(7時台)', + 'episode': '10月8日放送', + 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', + }, + 'skip': 'expires 2023-10-15', }] def _real_extract(self, url): @@ -206,7 +219,7 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P[0-9a-z]+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P\w+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' 
_TESTS = [{ # video program episodes 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', @@ -240,8 +253,7 @@ class NhkVodProgramIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() - + lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( program_id, lang, m_type == 'video', False, episode_type == 'clip') From 84e26038d4002e763ea51ca1bdce4f7e63c540bf Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 9 Oct 2023 13:30:36 -0500 Subject: [PATCH 453/501] [utils] `write_xattr`: Use `os.setxattr` if available (#8205) Closes #8193 Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- README.md | 2 +- yt_dlp/utils/_utils.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a0b69c9a1a..a26482faaa 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ ### Metadata * [**mutagen**](https://github.com/quodlibet/mutagen)\* - For `--embed-thumbnail` in certain formats. Licensed under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) * [**AtomicParsley**](https://github.com/wez/atomicparsley) - For `--embed-thumbnail` in `mp4`/`m4a` files when `mutagen`/`ffmpeg` cannot. Licensed under [GPLv2+](https://github.com/wez/atomicparsley/blob/master/COPYING) -* [**xattr**](https://github.com/xattr/xattr), [**pyxattr**](https://github.com/iustin/pyxattr) or [**setfattr**](http://savannah.nongnu.org/projects/attr) - For writing xattr metadata (`--xattr`) on **Linux**. Licensed under [MIT](https://github.com/xattr/xattr/blob/master/LICENSE.txt), [LGPL2.1](https://github.com/iustin/pyxattr/blob/master/COPYING) and [GPLv2+](http://git.savannah.nongnu.org/cgit/attr.git/tree/doc/COPYING) respectively +* [**xattr**](https://github.com/xattr/xattr), [**pyxattr**](https://github.com/iustin/pyxattr) or [**setfattr**](http://savannah.nongnu.org/projects/attr) - For writing xattr metadata (`--xattr`) on **Mac** and **BSD**. Licensed under [MIT](https://github.com/xattr/xattr/blob/master/LICENSE.txt), [LGPL2.1](https://github.com/iustin/pyxattr/blob/master/COPYING) and [GPLv2+](http://git.savannah.nongnu.org/cgit/attr.git/tree/doc/COPYING) respectively ### Misc diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 3dc17bf593..10c7c43110 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -4441,10 +4441,12 @@ def write_xattr(path, key, value): raise XAttrMetadataError(e.errno, e.strerror) return - # UNIX Method 1. Use xattrs/pyxattrs modules + # UNIX Method 1. 
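# --- editorial sketch (not part of the patch) -----------------------------
# The effect of making NhkVodIE._VALID_URL a list above: video and audio
# pages get separate patterns and a URL is routed by whichever one matches.
# Patterns here are trimmed-down stand-ins for the real ones in the diff.
import re

PATTERNS = [
    r'/ondemand/(?P<type>video)/(?P<id>[0-9a-z]+)',
    r'/ondemand/(?P<type>audio)/(?P<id>[^/?#]+?-\d{8}-[0-9a-z]+)',
]

def classify(url):
    for pattern in PATTERNS:
        mobj = re.search(pattern, url)
        if mobj:
            return mobj.group('type'), mobj.group('id')

print(classify('https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/'))
# ('video', '2049126')
print(classify('https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/'))
# ('audio', 'livinginjapan-20231001-1')
# --------------------------------------------------------------------------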
Use os.setxattr/xattrs/pyxattrs modules setxattr = None - if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': + if callable(getattr(os, 'setxattr', None)): + setxattr = os.setxattr + elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr': # Unicode arguments are not supported in pyxattr until version 0.5.0 # See https://github.com/ytdl-org/youtube-dl/issues/5498 if version_tuple(xattr.__version__) >= (0, 5, 0): From feebf6d02fc9651331eee2af5e08e6112288163b Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 12 Oct 2023 12:20:52 +0200 Subject: [PATCH 454/501] [ie/youtube] Fix bug with `--extractor-retries inf` (#8328) Authored by: Grub4K --- yt_dlp/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b7ac3e9cc1..c5be366362 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -947,7 +947,10 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers icd_rm = next(icd_retries) main_retries = iter(self.RetryManager()) main_rm = next(main_retries) - for _ in range(main_rm.retries + icd_rm.retries + 1): + # Manual retry loop for multiple RetryManagers + # The proper RetryManager MUST be advanced after an error + # and it's result MUST be checked if the manager is non fatal + while True: try: response = self._call_api( ep=ep, fatal=True, headers=headers, From b9316642313bbc9e209ac0d2276d37ba60bceb49 Mon Sep 17 00:00:00 2001 From: bashonly Date: Fri, 13 Oct 2023 14:23:39 -0500 Subject: [PATCH 455/501] [ie/radiko] Fix bug with `downloader_options` Closes #8333 Authored by: bashonly --- yt_dlp/extractor/radiko.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index 8c8fb1a8f9..c363d9ba5f 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -154,7 +154,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, sf['preference'] = -100 sf['format_note'] = 'not preferred' if not is_onair and timefree_int == 1 and time_to_skip: - sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} + sf['downloader_options'] = {'ffmpeg_args': ['-ss', str(time_to_skip)]} formats.extend(subformats) return formats From e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:29:56 +0200 Subject: [PATCH 456/501] [ie/mbn] Add extractor (#8312) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mbn.py | 89 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 yt_dlp/extractor/mbn.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6717a6039f..45073628c8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1053,6 +1053,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .masters import MastersIE from .matchtv import MatchTVIE +from .mbn import MBNIE from .mdr import MDRIE from .medaltv import MedalTVIE from .mediaite import MediaiteIE diff --git a/yt_dlp/extractor/mbn.py b/yt_dlp/extractor/mbn.py new file mode 100644 index 0000000000..4917c4698e --- /dev/null +++ b/yt_dlp/extractor/mbn.py @@ -0,0 +1,89 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class 
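# --- editorial sketch (not part of the patch) -----------------------------
# The bug behind the youtube.py 'while True' fix above: with
# --extractor-retries inf the retry count is float('inf'), and range()
# rejects floats, so the old bounded loop crashed before retrying at all.
retries = float('inf')
try:
    for _ in range(retries + 1):         # shape of the old loop
        break
except TypeError as e:
    print(e)    # 'float' object cannot be interpreted as an integer
# Hence the unbounded 'while True', with the RetryManagers themselves
# deciding when to stop or re-raise.
# --------------------------------------------------------------------------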
MBNIE(InfoExtractor): + IE_DESC = 'mbn.co.kr (매일방송)' + _VALID_URL = r'https?://(?:www\.)?mbn\.co\.kr/vod/programContents/preview(?:list)?/\d+/\d+/(?P\d+)' + _TESTS = [{ + 'url': 'https://mbn.co.kr/vod/programContents/previewlist/861/5433/1276155', + 'md5': '85e1694e5b247c04d1386b7e3c90fd76', + 'info_dict': { + 'id': '1276155', + 'ext': 'mp4', + 'title': '결국 사로잡힌 권유리, 그녀를 목숨 걸고 구하려는 정일우!', + 'duration': 3891, + 'release_date': '20210703', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/861/2021/07/03/20210703230811_20_861_1276155_360_7_0.jpg', + 'series': '보쌈 - 운명을 훔치다', + 'episode': 'Episode 19', + 'episode_number': 19, + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/previewlist/835/5294/1084744', + 'md5': 'fc65d3aac85e85e0b5056f4ef99cde4a', + 'info_dict': { + 'id': '1084744', + 'ext': 'mp4', + 'title': '김정은♥최원영, 제자리를 찾은 위험한 부부! "결혼은 투쟁이면서, 어려운 방식이야.."', + 'duration': 93, + 'release_date': '20201124', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/835/2020/11/25/20201125000221_21_835_1084744_360_7_0.jpg', + 'series': '나의 위험한 아내', + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/preview/952/6088/1054797?next=1', + 'md5': 'c711103c72aeac8323a5cf1751f10097', + 'info_dict': { + 'id': '1054797', + 'ext': 'mp4', + 'title': '[2차 티저] MBN 주말 미니시리즈 <완벽한 결혼의 정석> l 그녀에게 주어진 두 번째 인생', + 'duration': 65, + 'release_date': '20231028', + 'thumbnail': 'http://img.vod.mbn.co.kr/vod2/952/2023/09/11/20230911130223_22_952_1054797_1080_7.jpg', + 'series': '완벽한 결혼의 정석', + }, + }] + + def _real_extract(self, url): + content_id = self._match_id(url) + webpage = self._download_webpage(url, content_id) + + content_cls_cd = self._search_regex( + r'"\?content_cls_cd=(\d+)&', webpage, 'content cls cd', fatal=False) or '20' + media_info = self._download_json( + 'https://www.mbn.co.kr/player/mbnVodPlayer_2020.mbn', content_id, + note='Fetching playback data', query={ + 'content_cls_cd': content_cls_cd, + 'content_id': content_id, + 'relay_type': '1', + }) + + formats = [] + for stream_url in traverse_obj(media_info, ('movie_list', ..., 'url', {url_or_none})): + stream_url = re.sub(r'/(?:chunk|play)list(?:_pd\d+)?\.m3u8', '/manifest.m3u8', stream_url) + final_url = url_or_none(self._download_webpage( + f'https://www.mbn.co.kr/player/mbnStreamAuth_new_vod.mbn?vod_url={stream_url}', + content_id, note='Fetching authenticated m3u8 url')) + + formats.extend(self._extract_m3u8_formats(final_url, content_id, fatal=False)) + + return { + 'id': content_id, + **traverse_obj(media_info, { + 'title': ('movie_title', {str}), + 'duration': ('play_sec', {int_or_none}), + 'release_date': ('bcast_date', {lambda x: x.replace('.', '')}, {unified_strdate}), + 'thumbnail': ('movie_start_Img', {url_or_none}), + 'series': ('prog_nm', {str}), + 'episode_number': ('ad_contentnumber', {int_or_none}), + }), + 'formats': formats, + } From b286ec68f1f28798b3e371f888a2ed97d399cf77 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:30:24 +0200 Subject: [PATCH 457/501] [ie/jtbc] Add extractors (#8314) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/jtbc.py | 156 ++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 yt_dlp/extractor/jtbc.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 45073628c8..ca45711828 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -896,6 +896,10 @@ from .jove import JoveIE from .joj import JojIE 
from .jstream import JStreamIE +from .jtbc import ( + JTBCIE, + JTBCProgramIE, +) from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE diff --git a/yt_dlp/extractor/jtbc.py b/yt_dlp/extractor/jtbc.py new file mode 100644 index 0000000000..573f7492fe --- /dev/null +++ b/yt_dlp/extractor/jtbc.py @@ -0,0 +1,156 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class JTBCIE(InfoExtractor): + IE_DESC = 'jtbc.co.kr' + _VALID_URL = r'''(?x) + https?://(?: + vod\.jtbc\.co\.kr/player/(?:program|clip) + |tv\.jtbc\.co\.kr/(?:replay|trailer|clip)/pr\d+/pm\d+ + )/(?P(?:ep|vo)\d+)''' + _GEO_COUNTRIES = ['KR'] + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10011629/pm10067930/ep20216321/view', + 'md5': 'e6ade71d8c8685bbfd6e6ce4167c6a6c', + 'info_dict': { + 'id': 'VO10721192', + 'display_id': 'ep20216321', + 'ext': 'mp4', + 'title': '힘쎈여자 강남순 2회 다시보기', + 'description': 'md5:043c1d9019100ce271dba09995dbd1e2', + 'duration': 3770.0, + 'release_date': '20231008', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/drama/stronggirlnamsoon/img/20231008_163541_522_1.jpg', + 'series': '힘쎈여자 강남순', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/program/ep20216733', + 'md5': '217a6d190f115a75e4bda0ceaa4cd7f4', + 'info_dict': { + 'id': 'VO10721429', + 'display_id': 'ep20216733', + 'ext': 'mp4', + 'title': '헬로 마이 닥터 친절한 진료실 149회 다시보기', + 'description': 'md5:1d70788a982dd5de26874a92fcffddb8', + 'duration': 2720.0, + 'release_date': '20231009', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/culture/hellomydoctor/img/20231009_095002_528_1.jpg', + 'series': '헬로 마이 닥터 친절한 진료실', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/clip/vo10721270', + 'md5': '05782e2dc22a9c548aebefe62ae4328a', + 'info_dict': { + 'id': 'VO10721270', + 'display_id': 'vo10721270', + 'ext': 'mp4', + 'title': '뭉쳐야 찬다3 2회 예고편 - A매치로 향하는 마지막 관문💥', + 'description': 'md5:d48b51a8655c84843b4ed8d0c39aae68', + 'duration': 46.0, + 'release_date': '20231015', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/soccer3/img/20231008_210957_775_1.jpg', + 'series': '뭉쳐야 찬다3', + }, + }, { + 'url': 'https://tv.jtbc.co.kr/trailer/pr10010392/pm10032526/vo10720912/view', + 'md5': '367d480eb3ef54a9cd7a4b4d69c4b32d', + 'info_dict': { + 'id': 'VO10720912', + 'display_id': 'vo10720912', + 'ext': 'mp4', + 'title': '아는 형님 404회 예고편 | 10월 14일(토) 저녁 8시 50분 방송!', + 'description': 'md5:2743bb1079ceb85bb00060f2ad8f0280', + 'duration': 148.0, + 'release_date': '20231014', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/jtbcbros/img/20231006_230023_802_1.jpg', + 'series': '아는 형님', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + if display_id.startswith('vo'): + video_id = display_id.upper() + else: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-vod="(VO\d+)"', webpage, 'vod id') + + playback_data = self._download_json( + f'https://api.jtbc.co.kr/vod/{video_id}', video_id, note='Downloading VOD playback data') + + subtitles = {} + for sub in traverse_obj(playback_data, ('tracks', lambda _, v: v['file'])): + subtitles.setdefault(sub.get('label', 'und'), []).append({'url': sub['file']}) + + formats = [] + for stream_url in traverse_obj(playback_data, ('sources', 'HLS', ..., 'file', 
{url_or_none})): + stream_url = re.sub(r'/playlist(?:_pd\d+)?\.m3u8', '/index.m3u8', stream_url) + formats.extend(self._extract_m3u8_formats(stream_url, video_id, fatal=False)) + + metadata = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vod/detail', video_id, + note='Downloading mobile details', fatal=False, query={'vodFileId': video_id}) + return { + 'id': video_id, + 'display_id': display_id, + **traverse_obj(metadata, ('vodDetail', { + 'title': 'vodTitleView', + 'series': 'programTitle', + 'age_limit': ('watchAge', {int_or_none}), + 'release_date': ('broadcastDate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'description': 'episodeContents', + 'thumbnail': ('imgFileUrl', {url_or_none}), + })), + 'duration': parse_duration(playback_data.get('playTime')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class JTBCProgramIE(InfoExtractor): + IE_NAME = 'JTBC:program' + _VALID_URL = r'https?://(?:vod\.jtbc\.co\.kr/program|tv\.jtbc\.co\.kr/replay)/(?Ppr\d+)/(?:replay|pm\d+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10010392/pm10032710', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10010392', + }, + 'playlist_count': 398, + }, { + 'url': 'https://vod.jtbc.co.kr/program/pr10011491/replay', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10011491', + }, + 'playlist_count': 59, + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + vod_list = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vodClip/programHome/programReplayVodList', program_id, + note='Downloading program replay list', query={ + 'programId': program_id, + 'rowCount': '10000', + }) + + entries = [self.url_result(f'https://vod.jtbc.co.kr/player/program/{video_id}', JTBCIE, video_id) + for video_id in traverse_obj(vod_list, ('programReplayVodList', ..., 'episodeId'))] + return self.playlist_result(entries, program_id) From 2acd1d555ef89851c73773776715d3de9a0e30b9 Mon Sep 17 00:00:00 2001 From: Riteo Date: Fri, 13 Oct 2023 22:01:39 +0200 Subject: [PATCH 458/501] [core] Ensure thumbnail output directory exists (#7985) Closes #8203 Authored by: Riteo --- yt_dlp/YoutubeDL.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f322b12a22..71d17ac01c 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4221,7 +4221,7 @@ def _write_subtitles(self, info_dict, filename): return ret def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): - ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) ''' + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename); or None if error ''' write_all = self.params.get('write_all_thumbnails', False) thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): @@ -4237,6 +4237,9 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None self.write_debug(f'Skipping writing {label} thumbnail') return ret + if not self._ensure_dir_exists(filename): + return None + for idx, t in list(enumerate(thumbnails))[::-1]: thumb_ext = (f'{t["id"]}.' 
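# --- editorial sketch (not part of the patches) ----------------------------
# The URL rewrite both the MBN and JTBC extractors above perform: the site
# hands out a single variant playlist, but the master manifest listing all
# qualities lives at a sibling path, so the final component is swapped.
# This is JTBC's variant; MBN's differs only in pattern and replacement
# name. Sample URL is made up.
import re

def to_master(stream_url):
    return re.sub(r'/playlist(?:_pd\d+)?\.m3u8', '/index.m3u8', stream_url)

print(to_master('https://example.invalid/vod/ep1/playlist_pd3.m3u8'))
# https://example.invalid/vod/ep1/index.m3u8
# --------------------------------------------------------------------------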
if multiple else '') + determine_ext(t['url'], 'jpg') thumb_display_id = f'{label} thumbnail {t["id"]}' From b634ba742d8f38ce9ecfa0546485728b0c6c59d1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Oct 2023 17:15:35 -0500 Subject: [PATCH 459/501] [cleanup] Misc (#8338) Authored by: bashonly, gamer191 --- README.md | 3 +-- yt_dlp/extractor/banbye.py | 4 ++-- yt_dlp/extractor/breitbart.py | 2 +- yt_dlp/extractor/craftsy.py | 2 +- yt_dlp/extractor/cybrary.py | 4 ++-- yt_dlp/extractor/fifa.py | 2 +- yt_dlp/extractor/filmmodu.py | 2 +- yt_dlp/extractor/itprotv.py | 4 ++-- yt_dlp/extractor/jable.py | 4 ++-- yt_dlp/extractor/kommunetv.py | 2 +- yt_dlp/extractor/mainstreaming.py | 2 +- yt_dlp/extractor/mediaite.py | 2 +- yt_dlp/extractor/mocha.py | 2 +- yt_dlp/extractor/nfl.py | 4 ++-- yt_dlp/extractor/novaplay.py | 2 +- yt_dlp/extractor/nubilesporn.py | 2 +- yt_dlp/extractor/oftv.py | 4 ++-- yt_dlp/extractor/sina.py | 2 +- yt_dlp/extractor/twitter.py | 2 +- yt_dlp/extractor/utreon.py | 2 +- yt_dlp/extractor/vk.py | 4 ++-- yt_dlp/extractor/weverse.py | 12 ++++++------ yt_dlp/extractor/wimtv.py | 2 +- yt_dlp/extractor/xhamster.py | 4 ++-- yt_dlp/extractor/youtube.py | 2 +- yt_dlp/extractor/zoom.py | 2 +- yt_dlp/options.py | 2 +- 27 files changed, 40 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index a26482faaa..dd4652d43a 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,6 @@ # NEW FEATURES * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) - * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given * Channel URLs download all uploads of the channel, including shorts and live * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` @@ -913,7 +912,7 @@ ## Authentication Options: Defaults to ~/.netrc --netrc-cmd NETRC_CMD Command to execute to get the credentials for an extractor. 
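# --- editorial sketch (not part of the patch) -----------------------------
# What the _ensure_dir_exists call added above guards against: an output
# template can place thumbnails in a directory that does not exist yet, so
# the write would fail. Stdlib equivalent of the guard (yt-dlp's helper
# additionally reports failures through its own logger):
import os

def ensure_dir_exists(filename):
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)   # no-op if already present

ensure_dir_exists('thumbs/channel/video.jpg')   # open(..., 'wb') is now safe
# --------------------------------------------------------------------------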
- --video-password PASSWORD Video password (vimeo, youku) + --video-password PASSWORD Video-specific password --ap-mso MSO Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index e0fc93b973..dfcc82f021 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id): class BanByeIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P[\w-]+)' _TESTS = [{ 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', @@ -120,7 +120,7 @@ def _real_extract(self, url): class BanByeChannelIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P\w+)' _TESTS = [{ 'url': 'https://banbye.com/channel/ch_wrealu24', 'info_dict': { diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index ea0a59c866..b5abb7f194 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -2,7 +2,7 @@ class BreitBartIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?breitbart\.com/videos/v/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py index 307bfb9460..5d3733143a 100644 --- a/yt_dlp/extractor/craftsy.py +++ b/yt_dlp/extractor/craftsy.py @@ -10,7 +10,7 @@ class CraftsyIE(InfoExtractor): - _VALID_URL = r'https?://www.craftsy.com/class/(?P[a-z0-9_-]+)/' + _VALID_URL = r'https?://www\.craftsy\.com/class/(?P[\w-]+)' _TESTS = [{ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', 'info_dict': { diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index 73f2439b31..aeffe93b41 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -45,7 +45,7 @@ def _get_vimeo_id(self, activity_id): class CybraryIE(CybraryBaseIE): - _VALID_URL = r'https?://app.cybrary.it/immersive/(?P[0-9]+)/activity/(?P[0-9]+)' + _VALID_URL = r'https?://app\.cybrary\.it/immersive/(?P[0-9]+)/activity/(?P[0-9]+)' _TESTS = [{ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', 'md5': '9ae12d37e555cb2ed554223a71a701d0', @@ -110,7 +110,7 @@ def _real_extract(self, url): class CybraryCourseIE(CybraryBaseIE): - _VALID_URL = r'https://app.cybrary.it/browse/course/(?P[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https://app\.cybrary\.it/browse/course/(?P[\w-]+)/?(?:$|[#?])' _TESTS = [{ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', 'info_dict': { diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index 8b4db3a8ae..f604cbd40d 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -8,7 +8,7 @@ class FifaIE(InfoExtractor): - _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' + _VALID_URL = r'https?://www\.fifa\.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' _TESTS = [{ 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', 'info_dict': { diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py index 9eb550eed5..1e793560d4 100644 --- a/yt_dlp/extractor/filmmodu.py +++ 
b/yt_dlp/extractor/filmmodu.py @@ -3,7 +3,7 @@ class FilmmoduIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' + _VALID_URL = r'https?://(?:www\.)?filmmodu\.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' _TESTS = [{ 'url': 'https://www.filmmodu.org/f9-altyazili-izle', 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py index 4ac12603ae..b9d5c196d0 100644 --- a/yt_dlp/extractor/itprotv.py +++ b/yt_dlp/extractor/itprotv.py @@ -31,7 +31,7 @@ def _check_if_logged_in(self, webpage): class ITProTVIE(ITProTVBaseIE): - _VALID_URL = r'https://app.itpro.tv/course/(?P[\w-]+)/(?P[\w-]+)' + _VALID_URL = r'https://app\.itpro\.tv/course/(?P[\w-]+)/(?P[\w-]+)' _TESTS = [{ 'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv', 'md5': 'bca4a28c2667fd1a63052e71a94bb88c', @@ -102,7 +102,7 @@ def _real_extract(self, url): class ITProTVCourseIE(ITProTVBaseIE): - _VALID_URL = r'https?://app.itpro.tv/course/(?P[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https?://app\.itpro\.tv/course/(?P[\w-]+)/?(?:$|[#?])' _TESTS = [ { 'url': 'https://app.itpro.tv/course/guided-tour', diff --git a/yt_dlp/extractor/jable.py b/yt_dlp/extractor/jable.py index 84c3225e48..71fed49ea0 100644 --- a/yt_dlp/extractor/jable.py +++ b/yt_dlp/extractor/jable.py @@ -10,7 +10,7 @@ class JableIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?jable\.tv/videos/(?P[\w-]+)' _TESTS = [{ 'url': 'https://jable.tv/videos/pppd-812/', 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', @@ -64,7 +64,7 @@ def _real_extract(self, url): class JablePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?jable\.tv/(?:categories|models|tags)/(?P[\w-]+)' _TESTS = [{ 'url': 'https://jable.tv/models/kaede-karen/', 'info_dict': { diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py index e21e556be3..a30905b579 100644 --- a/yt_dlp/extractor/kommunetv.py +++ b/yt_dlp/extractor/kommunetv.py @@ -3,7 +3,7 @@ class KommunetvIE(InfoExtractor): - _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _VALID_URL = r'https://\w+\.kommunetv\.no/archive/(?P\w+)' _TEST = { 'url': 'https://oslo.kommunetv.no/archive/921', 'md5': '5f102be308ee759be1e12b63d5da4bbc', diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py index fe5589d598..fd9bba8bcb 100644 --- a/yt_dlp/extractor/mainstreaming.py +++ b/yt_dlp/extractor/mainstreaming.py @@ -13,7 +13,7 @@ class MainStreamingIE(InfoExtractor): - _VALID_URL = r'https?://(?:webtools-?)?(?P[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P\w+)' + _VALID_URL = r'https?://(?:webtools-?)?(?P[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P\w+)' _EMBED_REGEX = [rf']+?src=["\']?(?P{_VALID_URL})["\']?'] IE_DESC = 'MainStreaming Player' diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py index ab253920b6..32887cbdef 100644 --- a/yt_dlp/extractor/mediaite.py +++ b/yt_dlp/extractor/mediaite.py @@ -2,7 +2,7 @@ class MediaiteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _VALID_URL = r'https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}' _TESTS = [{ 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', 
'info_dict': { diff --git a/yt_dlp/extractor/mocha.py b/yt_dlp/extractor/mocha.py index 5f72b810bb..2fbc0e9110 100644 --- a/yt_dlp/extractor/mocha.py +++ b/yt_dlp/extractor/mocha.py @@ -3,7 +3,7 @@ class MochaVideoIE(InfoExtractor): - _VALID_URL = r'https?://video.mocha.com.vn/(?P[\w-]+)' + _VALID_URL = r'https?://video\.mocha\.com\.vn/(?P[\w-]+)' _TESTS = [{ 'url': 'http://video.mocha.com.vn/chuyen-meo-gia-su-tu-thong-diep-cuoc-song-v18694039', 'info_dict': { diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index bd060dba9d..3f83cd20ef 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -247,7 +247,7 @@ def _real_extract(self, url): class NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/(?P[\w-]+)(?:/(?P\d+))?' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/games/(?P[\w-]+)(?:/(?P\d+))?' _TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -342,7 +342,7 @@ def entries(): class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/episodes/(?P[\w-]+)' _TESTS = [{ 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index 92d1d136c7..d8849cd88d 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -3,7 +3,7 @@ class NovaPlayIE(InfoExtractor): - _VALID_URL = r'https://play.nova\.bg/video/.*/(?P\d+)' + _VALID_URL = r'https://play\.nova\.bg/video/[^?#]+/(?P\d+)' _TESTS = [ { 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat/606627', diff --git a/yt_dlp/extractor/nubilesporn.py b/yt_dlp/extractor/nubilesporn.py index d4f1d9d67a..1d630f547d 100644 --- a/yt_dlp/extractor/nubilesporn.py +++ b/yt_dlp/extractor/nubilesporn.py @@ -19,7 +19,7 @@ class NubilesPornIE(InfoExtractor): _NETRC_MACHINE = 'nubiles-porn' _VALID_URL = r'''(?x) - https://members.nubiles-porn.com/video/watch/(?P\d+) + https://members\.nubiles-porn\.com/video/watch/(?P\d+) (?:/(?P[\w\-]+-s(?P\d+)e(?P\d+)))? ''' diff --git a/yt_dlp/extractor/oftv.py b/yt_dlp/extractor/oftv.py index 3ae7278fb9..4cac518463 100644 --- a/yt_dlp/extractor/oftv.py +++ b/yt_dlp/extractor/oftv.py @@ -4,7 +4,7 @@ class OfTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/video/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?of\.tv/video/(?P\w+)' _TESTS = [{ 'url': 'https://of.tv/video/627d7d95b353db0001dadd1a', 'md5': 'cb9cd5db3bb9ee0d32bfd7e373d6ef0a', @@ -34,7 +34,7 @@ def _real_extract(self, url): class OfTVPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/creators/(?P[a-zA-Z0-9-]+)/.?' 
+ _VALID_URL = r'https?://(?:www\.)?of\.tv/creators/(?P[a-zA-Z0-9-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://of.tv/creators/this-is-fire/', 'playlist_count': 8, diff --git a/yt_dlp/extractor/sina.py b/yt_dlp/extractor/sina.py index 9842811888..eeb9ebb44c 100644 --- a/yt_dlp/extractor/sina.py +++ b/yt_dlp/extractor/sina.py @@ -11,7 +11,7 @@ class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + _VALID_URL = r'''(?x)https?://(?:[^/?#]+\.)?video\.sina\.com\.cn/ (?: (?:view/|.*\#)(?P\d+)| .+?/(?P[^/?#]+)(?:\.s?html)| diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4065acbaaa..b6386214d9 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1741,7 +1741,7 @@ def _real_extract(self, url): class TwitterShortenerIE(TwitterBaseIE): IE_NAME = 'twitter:shortener' - _VALID_URL = r'https?://t.co/(?P[^?]+)|tco:(?P[^?]+)' + _VALID_URL = r'https?://t\.co/(?P[^?#]+)|tco:(?P[^?#]+)' _BASE_URL = 'https://t.co/' def _real_extract(self, url): diff --git a/yt_dlp/extractor/utreon.py b/yt_dlp/extractor/utreon.py index 90c10c051a..8a91691019 100644 --- a/yt_dlp/extractor/utreon.py +++ b/yt_dlp/extractor/utreon.py @@ -10,7 +10,7 @@ class UtreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?utreon.com/v/(?P[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P[\w-]+)' _TESTS = [{ 'url': 'https://utreon.com/v/z_I7ikQbuDw', 'info_dict': { diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 915422817a..c12e873623 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -97,12 +97,12 @@ class VKIE(VKBaseIE): (?: (?: (?:(?:m|new)\.)?vk\.com/video_| - (?:www\.)?daxab.com/ + (?:www\.)?daxab\.com/ ) ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)| - (?:www\.)?daxab.com/embed/ + (?:www\.)?daxab\.com/embed/ ) (?P-?\d+_\d+)(?:.*\blist=(?P([\da-f]+)|(ln-[\da-zA-Z]+)))? 
) diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index bbf62856a6..47f36806bf 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -182,7 +182,7 @@ def _extract_live_status(self, data): class WeverseIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/live/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/live/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/live/0-107323480', 'md5': '1fa849f00181eef9100d3c8254c47979', @@ -344,7 +344,7 @@ def _real_extract(self, url): class WeverseMediaIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/media/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/media/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/4-116372884', 'md5': '8efc9cfd61b2f25209eb1a5326314d28', @@ -420,7 +420,7 @@ def _real_extract(self, url): class WeverseMomentIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/moment/(?P[\da-f]+)/post/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/moment/(?P[\da-f]+)/post/(?P[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444', 'md5': '87733ac19a54081b7dfc2442036d282b', @@ -516,7 +516,7 @@ def _real_extract(self, url): class WeverseLiveTabIE(WeverseTabBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/live/?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/live/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/billlie/live/', 'playlist_mincount': 55, @@ -534,7 +534,7 @@ class WeverseLiveTabIE(WeverseTabBaseIE): class WeverseMediaTabIE(WeverseTabBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/', 'playlist_mincount': 231, @@ -558,7 +558,7 @@ class WeverseMediaTabIE(WeverseTabBaseIE): class WeverseLiveIE(WeverseBaseIE): - _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P[^/?#]+)/?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P[^/?#]+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://weverse.io/purplekiss', 'info_dict': { diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py index 5711123903..f9bf092df5 100644 --- a/yt_dlp/extractor/wimtv.py +++ b/yt_dlp/extractor/wimtv.py @@ -11,7 +11,7 @@ class WimTVIE(InfoExtractor): _player = None _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _VALID_URL = r'''(?x: - https?://platform.wim.tv/ + https?://platform\.wim\.tv/ (?: (?:embed/)?\? 
|\#/webtv/.+?/ diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index aec1f20bb8..01ac5ddb65 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -24,7 +24,7 @@ class XHamsterIE(InfoExtractor): _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' _VALID_URL = r'''(?x) https?:// - (?:.+?\.)?%s/ + (?:[^/?#]+\.)?%s/ (?: movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html| videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+) @@ -372,7 +372,7 @@ def get_height(s): class XHamsterEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS + _VALID_URL = r'https?://(?:[^/?#]+\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1'] _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c5be366362..ac28ed7d28 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -949,7 +949,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers main_rm = next(main_retries) # Manual retry loop for multiple RetryManagers # The proper RetryManager MUST be advanced after an error - # and it's result MUST be checked if the manager is non fatal + # and its result MUST be checked if the manager is non fatal while True: try: response = self._call_api( diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index 1e41d04349..329ba1415e 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -13,7 +13,7 @@ class ZoomIE(InfoExtractor): IE_NAME = 'zoom' - _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)' + _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[\w.-]+)' _TESTS = [{ 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 163809706a..85a6402a6d 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -727,7 +727,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, youku)') + help='Video-specific password') authentication.add_option( '--ap-mso', dest='ap_mso', metavar='MSO', From b73c4093187cffddcb6fbc4bfbdc0fea244ff1e9 Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Fri, 13 Oct 2023 22:22:31 +0000 Subject: [PATCH 460/501] Release 2023.10.13 Created by: bashonly :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +++---- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +++---- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 +++---- CONTRIBUTORS | 4 ++++ Changelog.md | 24 +++++++++++++++++++ supportedsites.md | 4 ++++ yt_dlp/version.py | 4 ++-- 10 files changed, 58 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index dacb41758d..6c713e5a83 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm
reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ec6e298a19..e20036ce8d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index cf3cdd21f3..a9845b6b83 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ 
b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 1bbcf68956..d3d60a11e5 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index d3bc06e809..57de148d04 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 30311d5b56..7b55a7427b 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.13** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.13 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + Latest version: 2023.10.13, Current version: 2023.10.13 + yt-dlp is up to date (2023.10.13) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 8eda413072..3035ee2961 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -509,3 +509,7 @@ handlerug jiru madewokherd xofe +awalgarg +midnightveil +naginatana +Riteo diff --git a/Changelog.md b/Changelog.md index 48dcbf1029..6f45eab2f2 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,30 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.10.13 + +#### Core changes +- [Ensure thumbnail output directory exists](https://github.com/yt-dlp/yt-dlp/commit/2acd1d555ef89851c73773776715d3de9a0e30b9) ([#7985](https://github.com/yt-dlp/yt-dlp/issues/7985)) by [Riteo](https://github.com/Riteo) +- **utils** + - `js_to_json`: [Fix `Date` constructor parsing](https://github.com/yt-dlp/yt-dlp/commit/9d7ded6419089c1bf252496073f73ad90ed71004) ([#8295](https://github.com/yt-dlp/yt-dlp/issues/8295)) by [awalgarg](https://github.com/awalgarg), [Grub4K](https://github.com/Grub4K) + - `write_xattr`: [Use `os.setxattr` if available](https://github.com/yt-dlp/yt-dlp/commit/84e26038d4002e763ea51ca1bdce4f7e63c540bf) ([#8205](https://github.com/yt-dlp/yt-dlp/issues/8205)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **artetv**: [Support age-restricted content](https://github.com/yt-dlp/yt-dlp/commit/09f815ad52843219a7ee3f2a0dddf6c250c91f0c) ([#8301](https://github.com/yt-dlp/yt-dlp/issues/8301)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **jtbc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b286ec68f1f28798b3e371f888a2ed97d399cf77) ([#8314](https://github.com/yt-dlp/yt-dlp/issues/8314)) by [seproDev](https://github.com/seproDev) +- **mbn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305) ([#8312](https://github.com/yt-dlp/yt-dlp/issues/8312)) by [seproDev](https://github.com/seproDev) +- **nhk**: [Fix Japanese-language VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/4de94b9e165bfd6421a692f5f2eabcdb08edcb71) ([#8309](https://github.com/yt-dlp/yt-dlp/issues/8309)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Fix bug with `downloader_options`](https://github.com/yt-dlp/yt-dlp/commit/b9316642313bbc9e209ac0d2276d37ba60bceb49) by [bashonly](https://github.com/bashonly) +- **tenplay**: [Add support for seasons](https://github.com/yt-dlp/yt-dlp/commit/88a99c87b680ae59002534a517e191f46c42cbd4) 
([#7939](https://github.com/yt-dlp/yt-dlp/issues/7939)) by [midnightveil](https://github.com/midnightveil) +- **youku**: [Improve tudou.com support](https://github.com/yt-dlp/yt-dlp/commit/b7098d46b552a9322c6cea39ba80be5229f922de) ([#8160](https://github.com/yt-dlp/yt-dlp/issues/8160)) by [naginatana](https://github.com/naginatana) +- **youtube**: [Fix bug with `--extractor-retries inf`](https://github.com/yt-dlp/yt-dlp/commit/feebf6d02fc9651331eee2af5e08e6112288163b) ([#8328](https://github.com/yt-dlp/yt-dlp/issues/8328)) by [Grub4K](https://github.com/Grub4K) + +#### Downloader changes +- **fragment**: [Improve progress calculation](https://github.com/yt-dlp/yt-dlp/commit/1c51c520f7b511ebd9e4eb7322285a8c31eedbbd) ([#8241](https://github.com/yt-dlp/yt-dlp/issues/8241)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [b634ba7](https://github.com/yt-dlp/yt-dlp/commit/b634ba742d8f38ce9ecfa0546485728b0c6c59d1) by [bashonly](https://github.com/bashonly), [gamer191](https://github.com/gamer191) + ### 2023.10.07 #### Extractor changes diff --git a/supportedsites.md b/supportedsites.md index ecef4dc2d1..0ab61d68d0 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -657,6 +657,8 @@ # Supported sites - **Joj** - **Jove** - **JStream** + - **JTBC**: jtbc.co.kr + - **JTBC:program** - **JWPlatform** - **Kakao** - **Kaltura** @@ -766,6 +768,7 @@ # Supported sites - **massengeschmack.tv** - **Masters** - **MatchTV** + - **MBN**: mbn.co.kr (매일방송) - **MDR**: MDR.DE and KiKA - **MedalTV** - **media.ccc.de** @@ -1468,6 +1471,7 @@ # Supported sites - **Tempo** - **TennisTV**: [*tennistv*](## "netrc machine") - **TenPlay**: [*10play*](## "netrc machine") + - **TenPlaySeason** - **TF1** - **TFO** - **TheHoleTv** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 60c1c94cc3..9d00963162 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.10.07' +__version__ = '2023.10.13' -RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991' +RELEASE_GIT_HEAD = 'b634ba742d8f38ce9ecfa0546485728b0c6c59d1' VARIANT = None From 700444c23ddb65f618c2abd942acdc0c58c650b1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Oct 2023 18:02:06 -0500 Subject: [PATCH 461/501] [ci] Run core tests with dependencies Authored by: bashonly, coletdjnz --- .github/workflows/core.yml | 2 +- devscripts/make_changelog.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7fcf11dfa2..7acaee1e83 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -33,7 +33,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install pytest - run: pip install pytest + run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False run: | diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 9ff65db146..d0e893e581 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -56,6 +56,7 @@ def subgroup_lookup(cls): }, cls.MISC: { 'build', + 'ci', 'cleanup', 'devscripts', 'docs', From 8a8b54523addf46dfd50ef599761a81bc22362e6 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 14 Oct 2023 12:33:00 +1300 Subject: [PATCH 462/501] [rh:requests] Add handler for `requests` HTTP library (#3668) Adds support for HTTPS proxies and persistent connections (keep-alive) Closes https://github.com/yt-dlp/yt-dlp/issues/1890 Resolves
https://github.com/yt-dlp/yt-dlp/issues/4070 Resolves https://github.com/ytdl-org/youtube-dl/issues/32549 Resolves https://github.com/ytdl-org/youtube-dl/issues/14523 Resolves https://github.com/ytdl-org/youtube-dl/issues/13734 Authored by: coletdjnz, Grub4K, bashonly --- .github/workflows/core.yml | 2 +- README.md | 4 +- requirements.txt | 2 + setup.py | 9 +- test/test_networking.py | 168 +++++++++--- test/test_socks.py | 36 +-- yt_dlp/YoutubeDL.py | 7 +- yt_dlp/__pyinstaller/hook-yt_dlp.py | 4 +- yt_dlp/dependencies/__init__.py | 9 + yt_dlp/networking/__init__.py | 10 + yt_dlp/networking/_helper.py | 20 +- yt_dlp/networking/_requests.py | 398 ++++++++++++++++++++++++++++ yt_dlp/networking/_urllib.py | 26 +- yt_dlp/options.py | 3 +- 14 files changed, 619 insertions(+), 79 deletions(-) create mode 100644 yt_dlp/networking/_requests.py diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7acaee1e83..049faf3738 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -32,7 +32,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest + - name: Install dependencies run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False diff --git a/README.md b/README.md index dd4652d43a..3b7432474d 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ ### Differences in default behavior * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is * yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. For ease of use, a few more compat options are available: @@ -164,7 +165,7 @@ ### Differences in default behavior * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler`. Use this to enable all future compat options # INSTALLATION @@ -274,6 +275,7 @@ ### Networking * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. 
Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT [1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) * [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**requests**](https://github.com/psf/requests)\* - HTTP library. For HTTPS proxy and persistent connections support. Licensed under [Apache-2.0](https://github.com/psf/requests/blob/main/LICENSE) ### Metadata diff --git a/requirements.txt b/requirements.txt index dde37120f7..112c30aeb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ websockets brotli; platform_python_implementation=='CPython' brotlicffi; platform_python_implementation!='CPython' certifi +requests>=2.31.0,<3 +urllib3>=1.26.17,<3 \ No newline at end of file diff --git a/setup.py b/setup.py index a2f9f55c36..1740db27d8 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,14 @@ def py2exe_params(): 'compressed': 1, 'optimize': 2, 'dist_dir': './dist', - 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # py2exe appears to confuse this with our socks library. + # We don't use pysocks and urllib3.contrib.socks would fail to import if tried. + 'urllib3.contrib.socks' + ], 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', diff --git a/test/test_networking.py b/test/test_networking.py index 5308c8d6fa..2b45deac79 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -28,7 +28,7 @@ from test.helper import FakeYDL, http_server_port from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli +from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -43,6 +43,7 @@ HTTPError, IncompleteRead, NoSupportingHandlers, + ProxyError, RequestError, SSLError, TransportError, @@ -305,7 +306,7 @@ def setup_class(cls): class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -316,7 +317,7 @@ def test_verify_cert(self, handler): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? 
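A hedged illustration, not part of the patch: the repeated change in these test hunks is mechanical, since each test declares which request handlers it covers through pytest's indirect `handler` parametrization, and the new `Requests` handler is opted in by extending one list per test. A hypothetical extra method of the `TestHTTPRequestHandler` class following the same pattern (the test name and endpoint choice here are assumptions, while `handler`, `Request`, `validate_and_send` and `self.http_port` all come from this file):

@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True)
def test_example_headers(self, handler):  # hypothetical test, shown for illustration only
    # pytest runs this once per handler name above; every backend must satisfy the same assertions
    with handler() as rh:
        res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/headers'))
        assert res.status == 200
        res.close()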
@@ -334,7 +335,7 @@ def test_ssl_error(self, handler): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -346,7 +347,7 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, @@ -361,14 +362,14 @@ def test_remove_dot_segments(self, handler): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -378,7 +379,7 @@ def test_raise_http_error(self, handler): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -389,7 +390,7 @@ def test_response_url(self, handler): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect(self, handler): with handler() as rh: def do_req(redirect_status, method, assert_no_content=False): @@ -444,7 +445,7 @@ def do_req(redirect_status, method, assert_no_content=False): with pytest.raises(HTTPError): do_req(code, 'GET') - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. 
with handler() as rh: @@ -476,19 +477,19 @@ def test_request_cookie_header(self, handler): assert b'Cookie: test=ytdlp' not in data assert b'Cookie: test=test' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -505,7 +506,7 @@ def test_cookies(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'Cookie: test=ytdlp' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -521,7 +522,7 @@ def test_headers(self, handler): assert b'Test2: test2' not in data assert b'Test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -537,7 +538,7 @@ def test_timeout(self, handler): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' with handler(source_address=source_address) as rh: @@ -545,13 +546,13 @@ def test_source_address(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '