From 954e57e405f79188450eb30103a9308732cd318f Mon Sep 17 00:00:00 2001 From: bytedream <63594396+bytedream@users.noreply.github.com> Date: Sat, 6 Apr 2024 12:53:20 +0200 Subject: [PATCH 001/124] [ie/crunchyroll] Fix extractor (#9615) Authored by: bytedream --- README.md | 3 +- yt_dlp/extractor/crunchyroll.py | 143 +++++++++++++++++--------------- 2 files changed, 75 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index d4dd2c7be5..ee1b599900 100644 --- a/README.md +++ b/README.md @@ -1784,8 +1784,7 @@ #### funimation * `version`: The video version to extract - `uncut` or `simulcast` #### crunchyrollbeta (Crunchyroll) -* `format`: Which stream type(s) to extract (default: `adaptive_hls`). Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2` -* `hardsub`: Preference order for which hardsub versions to extract, or `all` (default: `None` = no hardsubs), e.g. `crunchyrollbeta:hardsub=en-US,None` +* `hardsub`: One or more hardsub versions to extract (in order of preference), or `all` (default: `None` = no hardsubs will be extracted), e.g. `crunchyrollbeta:hardsub=en-US,de-DE` #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index d35e9995ab..118b575ab2 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,4 +1,5 @@ import base64 +import uuid from .common import InfoExtractor from ..networking.exceptions import HTTPError @@ -7,12 +8,11 @@ float_or_none, format_field, int_or_none, - join_nonempty, + jwt_decode_hs256, parse_age_limit, parse_count, parse_iso8601, qualities, - remove_start, time_seconds, traverse_obj, url_or_none, @@ -27,6 +27,7 @@ class CrunchyrollBaseIE(InfoExtractor): _AUTH_HEADERS = None _API_ENDPOINT = None _BASIC_AUTH = None + _IS_PREMIUM = None _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') _LOCALE_LOOKUP = { 'ar': 'ar-SA', @@ -84,11 +85,16 @@ def _update_auth(self): self.write_debug(f'Using cxApiParam={cx_api_param}') CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() - grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH} + if self.is_logged_in: + grant_type = 'etp_rt_cookie' + else: + grant_type = 'client_id' + auth_headers['ETP-Anonymous-ID'] = uuid.uuid4() try: auth_response = self._download_json( f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + headers=auth_headers, data=f'grant_type={grant_type}'.encode()) except ExtractorError as error: if isinstance(error.cause, HTTPError) and error.cause.status == 403: raise ExtractorError( @@ -97,6 +103,7 @@ def _update_auth(self): 'and your browser\'s User-Agent (with --user-agent)', expected=True) raise + CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...)) CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) @@ -135,62 +142,72 @@ def _call_api(self, path, internal_id, 
lang, note='api', query={}): raise ExtractorError(f'Unexpected response when downloading {note} JSON') return result - def _extract_formats(self, stream_response, display_id=None): - requested_formats = self._configuration_arg('format') or ['vo_adaptive_hls'] - available_formats = {} - for stream_type, streams in traverse_obj( - stream_response, (('streams', ('data', 0)), {dict.items}, ...)): - if stream_type not in requested_formats: + def _extract_chapters(self, internal_id): + # if no skip events are available, a 403 xml error is returned + skip_events = self._download_json( + f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) + if not skip_events: + return None + + chapters = [] + for event in ('recap', 'intro', 'credits', 'preview'): + start = traverse_obj(skip_events, (event, 'start', {float_or_none})) + end = traverse_obj(skip_events, (event, 'end', {float_or_none})) + # some chapters have no start and/or ending time, they will just be ignored + if start is None or end is None: continue - for stream in traverse_obj(streams, lambda _, v: v['url']): - hardsub_lang = stream.get('hardsub_locale') or '' - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) - available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end}) + + return chapters + + def _extract_stream(self, identifier, display_id=None): + if not display_id: + display_id = identifier + + self._update_auth() + stream_response = self._download_json( + f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', + display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS) + + available_formats = {'': ('', '', stream_response['url'])} + for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): + available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url']) requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - if '' in available_formats and 'all' not in requested_hardsubs: + hardsub_langs = [lang for lang in available_formats if lang] + if hardsub_langs and 'all' not in requested_hardsubs: full_format_langs = set(requested_hardsubs) + self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}') self.to_screen( - 'To get all formats of a hardsub language, use ' + 'To extract formats of a hardsub language, use ' '"--extractor-args crunchyrollbeta:hardsub=". 
' 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', only_once=True) else: full_format_langs = set(map(str.lower, available_formats)) - audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + audio_locale = traverse_obj(stream_response, ('audioLocale', {str})) hardsub_preference = qualities(requested_hardsubs[::-1]) - formats = [] - for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): - if stream_type.endswith('hls'): - if hardsub_lang.lower() in full_format_langs: - adaptive_formats = self._extract_m3u8_formats( - stream_url, display_id, 'mp4', m3u8_id=format_id, - fatal=False, note=f'Downloading {format_id} HLS manifest') - else: - adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream_url, display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') + formats, subtitles = [], {} + for format_id, hardsub_lang, stream_url in available_formats.values(): + if hardsub_lang.lower() in full_format_langs: + adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles( + stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS, + fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest') + self._merge_subtitles(dash_subs, target=subtitles) else: - self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) - continue + continue # XXX: Update this if/when meta mpd formats are working for f in adaptive_formats: if f.get('acodec') != 'none': f['language'] = audio_locale f['quality'] = hardsub_preference(hardsub_lang.lower()) formats.extend(adaptive_formats) - return formats + for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)): + subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})) - def _extract_subtitles(self, data): - subtitles = {} - - for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): - subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] - - return subtitles + return formats, subtitles class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): @@ -245,7 +262,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): 'like_count': int, 'dislike_count': int, }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}}, + 'format': 'bv[format_id~=hardsub]', + }, }, { # Premium only 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', @@ -306,6 +327,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', }, { 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', 'info_dict': { @@ -359,31 +381,15 @@ def entries(): else: raise ExtractorError(f'Unknown object type {object_type}') - # There might be multiple audio languages for one object (`_metadata.versions`), - # so we need to get the id from `streams_link` instead or we dont know which language to choose - streams_link = response.get('streams_link') - if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + if not self._IS_PREMIUM and traverse_obj(response, 
(f'{object_type}_metadata', 'is_premium_only')): message = f'This {object_type} is for premium members only' if self.is_logged_in: raise ExtractorError(message, expected=True) self.raise_login_required(message) - # We need go from unsigned to signed api to avoid getting soft banned - stream_response = self._call_cms_api_signed(remove_start( - streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') - result['formats'] = self._extract_formats(stream_response, internal_id) - result['subtitles'] = self._extract_subtitles(stream_response) + result['formats'], result['subtitles'] = self._extract_stream(internal_id) - # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json( - f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - internal_id, note='Downloading chapter info', fatal=False, errnote=False) - if isinstance(intro_chapter, dict): - result['chapters'] = [{ - 'title': 'Intro', - 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')), - }] + result['chapters'] = self._extract_chapters(internal_id) def calculate_count(item): return parse_count(''.join((item['displayed'], item.get('unit') or ''))) @@ -512,7 +518,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'egaono-hana', 'title': 'Egaono Hana', 'track': 'Egaono Hana', - 'artist': 'Goose house', + 'artists': ['Goose house'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'genres': ['J-Pop'], }, @@ -525,11 +531,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'crossing-field', 'title': 'Crossing Field', 'track': 'Crossing Field', - 'artist': 'LiSA', + 'artists': ['LiSA'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'genres': ['Anime'], }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', }, { 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', 'info_dict': { @@ -538,7 +545,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', - 'artist': 'LiSA', + 'artists': ['LiSA'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'description': 'md5:747444e7e6300907b7a43f0a0503072e', 'genres': ['J-Pop'], @@ -566,16 +573,14 @@ def _real_extract(self, url): if not response: raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - streams_link = response.get('streams_link') - if not streams_link and response.get('isPremiumOnly'): + if not self._IS_PREMIUM and response.get('isPremiumOnly'): message = f'This {response.get("type") or "media"} is for premium members only' if self.is_logged_in: raise ExtractorError(message, expected=True) self.raise_login_required(message) result = self._transform_music_response(response) - stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') - result['formats'] = self._extract_formats(stream_response, internal_id) + result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) return result @@ -587,7 +592,7 @@ def _transform_music_response(data): 'display_id': 'slug', 'title': 'title', 'track': 'title', - 'artist': ('artist', 'name'), + 'artists': ('artist', 'name', all), 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), 
'thumbnails': ('images', ..., ..., { 'url': ('source', {url_or_none}), @@ -611,7 +616,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE): 'info_dict': { 'id': 'MA179CB50D', 'title': 'LiSA', - 'genres': ['J-Pop', 'Anime', 'Rock'], + 'genres': ['Anime', 'J-Pop', 'Rock'], 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', }, 'playlist_mincount': 83, From a48cc86d6f6b20427553620c2ddb990ede6a4b41 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 6 Apr 2024 12:19:44 -0500 Subject: [PATCH 002/124] [ie/dropbox] Fix formats extraction (#9627) Closes #9533 Authored by: bashonly --- yt_dlp/extractor/dropbox.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index bc2efce123..0246975c1f 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -65,12 +65,14 @@ def _real_extract(self, url): formats, subtitles, has_anonymous_download = [], {}, False for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + if not has_anonymous_download: + has_anonymous_download = self._search_regex( + r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) transcode_url = self._search_regex( r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) if not transcode_url: continue formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') - has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) break # downloads enabled we can get the original file From 9415f1a5ef88482ebafe3083e8bcb778ac512df7 Mon Sep 17 00:00:00 2001 From: Tomoka1 <141353477+Tomoka1@users.noreply.github.com> Date: Sat, 6 Apr 2024 19:23:16 +0200 Subject: [PATCH 003/124] [ie/afreecatv] Overhaul extractor (#9566) Closes #4592, Closes #8862, Closes #9544 Authored by: bashonly, Tomoka1 Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/afreecatv.py | 231 +++++++++++----------------------- 1 file changed, 74 insertions(+), 157 deletions(-) diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 86e69a68ec..2c33c90dbb 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -1,20 +1,16 @@ import functools -import re from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, UserNotLive, - date_from_str, determine_ext, filter_dict, int_or_none, - unified_strdate, unified_timestamp, url_or_none, urlencode_postdata, - xpath_text, ) from ..utils.traversal import traverse_obj @@ -76,7 +72,6 @@ class AfreecaTVIE(AfreecaTVBaseIE): ) (?P\d+) ''' - _NETRC_MACHINE = 'afreecatv' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -129,6 +124,7 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'uploader': '♥이슬이', 'uploader_id': 'dasl8121', 'upload_date': '20170411', + 'timestamp': 1491929865, 'duration': 213, }, 'params': { @@ -162,176 +158,97 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'uploader_id': 'rlantnghks', 'uploader': '페이즈으', 'duration': 10840, - 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r', + 'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+', 
'upload_date': '20230108', + 'timestamp': 1673218805, 'title': '젠지 페이즈', }, 'params': { 'skip_download': True, }, + }, { + # adult content + 'url': 'https://vod.afreecatv.com/player/70395877', + 'only_matching': True, + }, { + # subscribers only + 'url': 'https://vod.afreecatv.com/player/104647403', + 'only_matching': True, + }, { + # private + 'url': 'https://vod.afreecatv.com/player/81669846', + 'only_matching': True, }] - @staticmethod - def parse_video_key(key): - video_key = {} - m = re.match(r'^(?P\d{8})_\w+_(?P\d+)$', key) - if m: - video_key['upload_date'] = m.group('upload_date') - video_key['part'] = int(m.group('part')) - return video_key - def _real_extract(self, url): video_id = self._match_id(url) - - partial_view = False - adult_view = False - for _ in range(2): - data = self._download_json( - 'https://api.m.afreecatv.com/station/video/a/view', - video_id, headers={'Referer': url}, data=urlencode_postdata({ - 'nTitleNo': video_id, - 'nApiLevel': 10, - }))['data'] - if traverse_obj(data, ('code', {int})) == -6221: - raise ExtractorError('The VOD does not exist', expected=True) - query = { + data = self._download_json( + 'https://api.m.afreecatv.com/station/video/a/view', video_id, + headers={'Referer': url}, data=urlencode_postdata({ 'nTitleNo': video_id, - 'nStationNo': data['station_no'], - 'nBbsNo': data['bbs_no'], - } - if partial_view: - query['partialView'] = 'SKIP_ADULT' - if adult_view: - query['adultView'] = 'ADULT_VIEW' - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, 'Downloading video info XML%s' - % (' (skipping adult)' if partial_view else ''), - video_id, headers={ - 'Referer': url, - }, query=query) + 'nApiLevel': 10, + }))['data'] - flag = xpath_text(video_xml, './track/flag', 'flag', default=None) - if flag and flag == 'SUCCEED': - break - if flag == 'PARTIAL_ADULT': - self.report_warning( - 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' - 'Only content suitable for all ages will be downloaded. ' - 'Provide account credentials if you wish to download restricted content.') - partial_view = True - continue - elif flag == 'ADULT': - if not adult_view: - adult_view = True - continue - error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' 
- else: - error = flag - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - else: - raise ExtractorError('Unable to download video info') + error_code = traverse_obj(data, ('code', {int})) + if error_code == -6221: + raise ExtractorError('The VOD does not exist', expected=True) + elif error_code == -6205: + raise ExtractorError('This VOD is private', expected=True) - video_element = video_xml.findall('./track/video')[-1] - if video_element is None or video_element.text is None: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - video_url = video_element.text.strip() - - title = xpath_text(video_xml, './track/title', 'title', fatal=True) - - uploader = xpath_text(video_xml, './track/nickname', 'uploader') - uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') - duration = int_or_none(xpath_text( - video_xml, './track/duration', 'duration')) - thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - - common_entry = { - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnail': thumbnail, - } - - info = common_entry.copy() - info.update({ - 'id': video_id, - 'title': title, - 'duration': duration, + common_info = traverse_obj(data, { + 'title': ('title', {str}), + 'uploader': ('writer_nick', {str}), + 'uploader_id': ('bj_id', {str}), + 'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}), + 'thumbnail': ('thumb', {url_or_none}), }) - if not video_url: - entries = [] - file_elements = video_element.findall('./file') - one = len(file_elements) == 1 - for file_num, file_element in enumerate(file_elements, start=1): - file_url = url_or_none(file_element.text) - if not file_url: - continue - key = file_element.get('key', '') - upload_date = unified_strdate(self._search_regex( - r'^(\d{8})_', key, 'upload date', default=None)) - if upload_date is not None: - # sometimes the upload date isn't included in the file name - # instead, another random ID is, which may parse as a valid - # date but be wildly out of a reasonable range - parsed_date = date_from_str(upload_date) - if parsed_date.year < 2000 or parsed_date.year >= 2100: - upload_date = None - file_duration = int_or_none(file_element.get('duration')) - format_id = key if key else '%s_%s' % (video_id, file_num) - if determine_ext(file_url) == 'm3u8': - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', - note='Downloading part %d m3u8 information' % file_num) - else: - formats = [{ - 'url': file_url, - 'format_id': 'http', - }] - if not formats and not self.get_param('ignore_no_formats'): - continue - file_info = common_entry.copy() - file_info.update({ - 'id': format_id, - 'title': title if one else '%s (part %d)' % (title, file_num), - 'upload_date': upload_date, - 'duration': file_duration, - 'formats': formats, + entries = [] + for file_num, file_element in enumerate( + traverse_obj(data, ('files', lambda _, v: url_or_none(v['file']))), start=1): + file_url = file_element['file'] + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', m3u8_id='hls', + note=f'Downloading part {file_num} m3u8 information') + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + + entries.append({ + **common_info, + 'id': file_element.get('file_info_key') or f'{video_id}_{file_num}', + 'title': f'{common_info.get("title") or "Untitled"} (part {file_num})', + 'formats': formats, + 
**traverse_obj(file_element, { + 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('file_start', {unified_timestamp}), }) - entries.append(file_info) - entries_info = info.copy() - entries_info.update({ - '_type': 'multi_video', - 'entries': entries, - }) - return entries_info - - info = { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - } - - if determine_ext(video_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - app, playpath = video_url.split('mp4:') - info.update({ - 'url': app, - 'ext': 'flv', - 'play_path': 'mp4:' + playpath, - 'rtmp_live': True, # downloading won't end without this }) - return info + if traverse_obj(data, ('adult_status', {str})) == 'notLogin': + if not entries: + self.raise_login_required( + 'Only users older than 19 are able to watch this video', method='password') + self.report_warning( + 'In accordance with local laws and regulations, underage users are ' + 'restricted from watching adult content. Only content suitable for all ' + f'ages will be downloaded. {self._login_hint("password")}') + + if not entries and traverse_obj(data, ('sub_upload_type', {str})): + self.raise_login_required('This VOD is for subscribers only', method='password') + + if len(entries) == 1: + return { + **entries[0], + 'title': common_info.get('title'), + } + + common_info['timestamp'] = traverse_obj(entries, (..., 'timestamp'), get_all=False) + + return self.playlist_result(entries, video_id, multi_video=True, **common_info) class AfreecaTVLiveIE(AfreecaTVBaseIE): From f2fd449b46c4058222e1744f7a35caa20b2d003d Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:34:51 +0000 Subject: [PATCH 004/124] [ie/joqrag] Fix live status detection (#9624) Authored by: pzhlkj6612 --- yt_dlp/extractor/joqrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py index c68ad8cb5f..7a91d4a235 100644 --- a/yt_dlp/extractor/joqrag.py +++ b/yt_dlp/extractor/joqrag.py @@ -80,7 +80,7 @@ def _real_extract(self, url): note='Downloading metadata', errnote='Failed to download metadata') title = self._extract_metadata('Program_name', metadata) - if title == '放送休止': + if not title or title == '放送休止': formats = [] live_status = 'is_upcoming' release_timestamp = self._extract_start_timestamp(video_id, False) From c8a61a910096c77ce08dad5e1b2fbda5eb964156 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 6 Apr 2024 12:42:32 -0500 Subject: [PATCH 005/124] [ie/kick] Support browser impersonation (#9611) Closes #6748 Authored by: bashonly --- yt_dlp/extractor/kick.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index d124372424..889548f526 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -13,7 +13,8 @@ class KickBaseIE(InfoExtractor): def _real_initialize(self): - self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False) + self._request_webpage( + HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False, impersonate=True) xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') if not xsrf_token: self.write_debug('kick.com did 
not set XSRF-TOKEN cookie') @@ -25,7 +26,7 @@ def _real_initialize(self): def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): return self._download_json( f'https://kick.com/api/v1/{path}', display_id, note=note, - headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs) class KickIE(KickBaseIE): @@ -82,26 +83,27 @@ def _real_extract(self, url): class KickVODIE(KickBaseIE): _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', - 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3', + 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': '54244b5e-050a-4df4-a013-b2433dafbe35', + 'id': '58bac65b-e641-4476-a7ba-3707a35e60e3', 'ext': 'mp4', - 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', - 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', - 'channel': 'kmack710', - 'channel_id': '16278', - 'uploader': 'Kmack710', - 'uploader_id': '16412', - 'upload_date': '20221206', - 'timestamp': 1670318289, - 'duration': 40104.0, + 'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠', + 'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d', + 'channel': 'jaredfps', + 'channel_id': '26608', + 'uploader': 'JaredFPS', + 'uploader_id': '26799', + 'upload_date': '20240402', + 'timestamp': 1712097108, + 'duration': 33859.0, 'thumbnail': r're:^https?://.*\.jpg', - 'categories': ['Grand Theft Auto V'], + 'categories': ['Call of Duty: Warzone'], }, 'params': { 'skip_download': 'm3u8', }, + 'expected_warnings': [r'impersonation'], }] def _real_extract(self, url): From b15b0c1d2106437ec61a5c436c543e8760eac160 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 6 Apr 2024 15:42:51 -0500 Subject: [PATCH 006/124] [ie/vkplay] Fix `_VALID_URL` (#9636) Closes #9635 Authored by: bashonly --- yt_dlp/extractor/vk.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index e4a78c2977..7e3a3a9a98 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -707,6 +707,7 @@ def _real_extract(self, url): class VKPlayBaseIE(InfoExtractor): + _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/' _RESOLUTIONS = { 'tiny': '256x144', 'lowest': '426x240', @@ -765,7 +766,7 @@ def _extract_common_meta(self, stream_info): class VKPlayIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P[^/#?]+)/record/(?P[a-f0-9-]+)' + _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P[^/#?]+)/record/(?P[\da-f-]+)' _TESTS = [{ 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da', 'info_dict': { @@ -776,13 +777,16 @@ class VKPlayIE(VKPlayBaseIE): 'uploader_id': '13159830', 'release_timestamp': 1683461378, 'release_date': '20230507', - 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+', + 'thumbnail': r're:https://[^/]+/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview', 'duration': 10608, 'view_count': int, 'like_count': int, 'categories': ['Atomic Heart'], }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', + 'only_matching': True, }] def 
_real_extract(self, url): @@ -802,7 +806,7 @@ def _real_extract(self, url): class VKPlayLiveIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P[^/#?]+)/?(?:[#?]|$)' + _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P[^/#?]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://vkplay.live/bayda', 'info_dict': { @@ -813,7 +817,7 @@ class VKPlayLiveIE(VKPlayBaseIE): 'uploader_id': '12279401', 'release_timestamp': 1687209962, 'release_date': '20230619', - 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+', + 'thumbnail': r're:https://[^/]+/public_video_stream/12279401/preview', 'view_count': int, 'concurrent_view_count': int, 'like_count': int, @@ -822,6 +826,9 @@ class VKPlayLiveIE(VKPlayBaseIE): }, 'skip': 'livestream', 'params': {'skip_download': True}, + }, { + 'url': 'https://live.vkplay.ru/lebwa', + 'only_matching': True, }] def _real_extract(self, url): From 2ab2651a4a7be18939e2b4cb21be79fe477c797a Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Sun, 7 Apr 2024 18:28:59 +0300 Subject: [PATCH 007/124] [cookies] Add `--cookies-from-browser` support for Firefox Flatpak (#9619) Authored by: un-def --- yt_dlp/cookies.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 85d6dd1823..7b8d215f03 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -194,7 +194,11 @@ def _firefox_browser_dirs(): yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') else: - yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox')) + yield from map(os.path.expanduser, ( + '~/.mozilla/firefox', + '~/snap/firefox/common/.mozilla/firefox', + '~/.var/app/org.mozilla.firefox/.mozilla/firefox', + )) def _firefox_cookie_dbs(roots): From fc53ec13ff1ee926a3e533a68cfca8acc887b661 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 7 Apr 2024 10:32:11 -0500 Subject: [PATCH 008/124] [ie/tiktok] Restore `carrier_region` API parameter (#9637) Avoids some geo-blocks Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 295e14932a..3f5261ad96 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -155,6 +155,7 @@ def _build_api_query(self, query): 'locale': 'en', 'ac2': 'wifi5g', 'uoo': '1', + 'carrier_region': 'US', 'op_region': 'US', 'build_number': self._APP_INFO['app_version'], 'region': 'US', From 36b240f9a72af57eb2c9d927ebb7fd1c917ebf18 Mon Sep 17 00:00:00 2001 From: John Victor <37747572+johnvictorfs@users.noreply.github.com> Date: Sun, 7 Apr 2024 13:26:44 -0300 Subject: [PATCH 009/124] [ie/patreon] Do not extract dead embed URLs (#9613) Closes #8702 Authored by: johnvictorfs --- yt_dlp/extractor/patreon.py | 44 +++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index d2ddb72cd4..d4f822f52d 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -92,7 +92,7 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': 're:^https?://.*$', 'upload_date': '20150211', 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', - 'uploader_id': 'TraciJHines', + 'uploader_id': '@TraciHinesMusic', 'categories': ['Entertainment'], 'duration': 282, 'view_count': int, @@ -106,8 +106,10 @@ class PatreonIE(PatreonBaseIE): 'availability': 'public', 'channel_follower_count': int, 
'playable_in_embed': True, - 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', 'comment_count': int, + 'channel_is_verified': True, + 'chapters': 'count:4', }, 'params': { 'noplaylist': True, @@ -176,6 +178,27 @@ class PatreonIE(PatreonBaseIE): 'uploader_url': 'https://www.patreon.com/thenormies', }, 'skip': 'Patron-only content', + }, { + # dead vimeo and embed URLs, need to extract post_file + 'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913', + 'info_dict': { + 'id': '34007913', + 'ext': 'mp4', + 'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!', + 'like_count': int, + 'uploader': 'YaBoyRoshi', + 'timestamp': 1581636833, + 'channel_url': 'https://www.patreon.com/yaboyroshi', + 'thumbnail': r're:^https?://.*$', + 'tags': ['Hunter x Hunter'], + 'uploader_id': '14264111', + 'comment_count': int, + 'channel_follower_count': int, + 'description': 'Kurapika is a walking cheat code!', + 'upload_date': '20200213', + 'channel_id': '2147162', + 'uploader_url': 'https://www.patreon.com/yaboyroshi', + }, }] def _real_extract(self, url): @@ -250,20 +273,13 @@ def _real_extract(self, url): v_url = url_or_none(compat_urllib_parse_unquote( self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) if v_url: - return { - **info, - '_type': 'url_transparent', - 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), - 'ie_key': 'Vimeo', - } + v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com') + if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False): + return self.url_result(v_url, VimeoIE, url_transparent=True, **info) embed_url = try_get(attributes, lambda x: x['embed']['url']) - if embed_url: - return { - **info, - '_type': 'url', - 'url': embed_url, - } + if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): + return self.url_result(embed_url, **info) post_file = traverse_obj(attributes, 'post_file') if post_file: From 4af9d5c2f6aa81403ae2a8a5ae3cc824730f0b86 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 7 Apr 2024 11:59:38 -0500 Subject: [PATCH 010/124] [ie/nhk] Fix NHK World extractors (#9623) Closes #9513 Authored by: bashonly --- yt_dlp/extractor/nhk.py | 200 +++++++++++++++++++++++++++++++--------- 1 file changed, 158 insertions(+), 42 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 7cf5b246b1..8bb017a732 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -8,6 +8,7 @@ int_or_none, join_nonempty, parse_duration, + remove_end, traverse_obj, try_call, unescapeHTML, @@ -19,8 +20,7 @@ class NhkBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' - _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand' - _TYPE_REGEX = r'/(?Pvideo|audio)/' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/' def _call_api(self, m_id, lang, is_video, is_episode, is_clip): return self._download_json( @@ -83,7 +83,7 @@ def _extract_stream_info(self, vod_id): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') - is_video = m_type == 'video' + is_video = m_type != 'audio' if is_video: episode_id = episode_id[:4] + '-' + 
episode_id[4:] @@ -138,9 +138,10 @@ def get_clean_field(key): else: if fetch_episode: - audio_path = episode['audio']['audio'] + # From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html + audio_path = remove_end(episode['audio']['audio'], '.m4a') info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8', episode_id, 'm4a', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) for f in info['formats']: @@ -155,9 +156,11 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): - # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?Pvideo)/(?P[0-9a-z]+)', - rf'{NhkBaseIE._BASE_URL_REGEX}/(?Paudio)/(?P[^/?#]+?-\d{{8}}-[0-9a-z]+)'] + _VALID_URL = [ + rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?Pvideo)/)?(?P\d{{4}}[\da-z]\d+)/?(?:$|[?#])', + rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?Paudio)/(?P[^/?#]+?-\d{{8}}-[\da-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?Pvideo)/(?P\d{{4}}[\da-z]\d+)', # deprecated + ] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -167,17 +170,16 @@ class NhkVodIE(NhkBaseIE): 'ext': 'mp4', 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', - 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'thumbnail': r're:https://.+/.+\.jpg', 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', 'series': 'Japan Railway Journal', - 'modified_timestamp': 1694243656, + 'modified_timestamp': 1707217907, 'timestamp': 1681428600, 'release_timestamp': 1693883728, 'duration': 1679, 'upload_date': '20230413', - 'modified_date': '20230909', + 'modified_date': '20240206', 'release_date': '20230905', - }, }, { # video clip @@ -188,15 +190,15 @@ class NhkVodIE(NhkBaseIE): 'ext': 'mp4', 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', + 'thumbnail': r're:https://.+/.+\.jpg', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', 'duration': 148, 'upload_date': '20190816', 'release_date': '20230902', 'release_timestamp': 1693619292, - 'modified_timestamp': 1694168033, - 'modified_date': '20230908', + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', 'timestamp': 1565997540, }, }, { @@ -208,7 +210,7 @@ class NhkVodIE(NhkBaseIE): 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', 'series': 'Living in Japan', 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab', - 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'thumbnail': r're:https://.+/.+\.jpg', 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, }, { @@ -245,7 +247,7 @@ class NhkVodIE(NhkBaseIE): 'title': 'おはよう日本(7時台) - 10月8日放送', 'series': 'おはよう日本(7時台)', 'episode': '10月8日放送', - 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'thumbnail': r're:https://.+/.+\.jpg', 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', }, 'skip': 'expires 2023-10-15', @@ -255,17 +257,100 @@ class NhkVodIE(NhkBaseIE): 'info_dict': { 'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552', 'ext': 'mp4', - 'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island', + 'title': 'Barakan Discovers 
- AMAMI OSHIMA: Isson\'s Treasure Isla', 'description': 'md5:5db620c46a0698451cc59add8816b797', - 'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd', + 'thumbnail': r're:https://.+/.+\.jpg', 'release_date': '20230905', 'timestamp': 1690103400, 'duration': 2939, 'release_timestamp': 1693898699, - 'modified_timestamp': 1698057495, - 'modified_date': '20231023', 'upload_date': '20230723', + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', + 'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla', + 'series': 'Barakan Discovers', }, + }, { + # /ondemand/video/ url with alphabetical character in 5th position of id + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/', + 'info_dict': { + 'id': 'nw_c_en_9999-a07', + 'ext': 'mp4', + 'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'series': 'Mini-Dramas on SDGs', + 'modified_date': '20240206', + 'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6', + 'timestamp': 1621962360, + 'duration': 189, + 'release_date': '20230903', + 'modified_timestamp': 1707217907, + 'upload_date': '20210525', + 'thumbnail': r're:https://.+/.+\.jpg', + 'release_timestamp': 1693713487, + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/', + 'info_dict': { + 'id': 'nw_c_en_9999-d17', + 'ext': 'mp4', + 'title': 'Flowers of snow blossom - The 72 Pentads of Yamato', + 'description': 'Today’s focus: Snow', + 'release_timestamp': 1693792402, + 'release_date': '20230904', + 'upload_date': '20220128', + 'timestamp': 1643370960, + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 136, + 'series': '', + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/', + 'info_dict': { + 'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282', + 'ext': 'mp4', + 'title': 'Japanology Plus - 20th Anniversary Special Part 1', + 'description': 'md5:817d41fc8e54339ad2a916161ea24faf', + 'episode': '20th Anniversary Special Part 1', + 'series': 'Japanology Plus', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 1680, + 'timestamp': 1711020600, + 'upload_date': '20240321', + 'release_timestamp': 1711022683, + 'release_date': '20240321', + 'modified_timestamp': 1711031012, + 'modified_date': '20240321', + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/', + 'info_dict': { + 'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944', + 'ext': 'mp4', + 'title': '100 Ideas to Save the World - Working Styles Evolve', + 'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9', + 'episode': 'Working Styles Evolve', + 'series': '100 Ideas to Save the World', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 899, + 'upload_date': '20230325', + 'timestamp': 1679755200, + 'release_date': '20230905', + 'release_timestamp': 1693880540, + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/audio/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/', + 'only_matching': True, + }, { + # valid url even if can't be found in wild; support needed for clip entries extraction + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/', + 'only_matching': True, }] def _real_extract(self, url): @@ -273,18 +358,21 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = 
rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P\w+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' + _VALID_URL = rf'''(?x) + {NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/ + (?:(?Paudio)/programs/)?(?P\w+)/? + (?:\?(?:[^#]+&)?type=(?Pclip|(?:radio|tv)Episode))?''' _TESTS = [{ # video program episodes - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/', 'info_dict': { 'id': 'sumo', 'title': 'GRAND SUMO Highlights', 'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf', }, - 'playlist_mincount': 0, + 'playlist_mincount': 1, }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', @@ -293,40 +381,68 @@ class NhkVodProgramIE(NhkBaseIE): 'playlist_mincount': 12, }, { # video program clips - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, - 'playlist_mincount': 5, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', - 'only_matching': True, + 'playlist_mincount': 12, }, { # audio program - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/', + 'info_dict': { + 'id': 'livinginjapan', + 'title': 'Living in Japan', + 'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54', + }, + 'playlist_mincount': 12, + }, { + # /tv/ program url + 'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/', + 'info_dict': { + 'id': 'designtalksplus', + 'title': 'DESIGN TALKS plus', + 'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if NhkVodIE.suitable(url) else super().suitable(url) + + def _extract_meta_from_class_elements(self, class_values, html): + for class_value in class_values: + if value := clean_html(get_element_by_class(class_value, html)): + return value + def _real_extract(self, url): lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( - program_id, lang, m_type == 'video', False, episode_type == 'clip') + program_id, lang, m_type != 'audio', False, episode_type == 'clip') - entries = [] - for episode in episodes: - episode_path = episode.get('url') - if not episode_path: - continue - entries.append(self._extract_episode_info( - urljoin(url, episode_path), episode)) + def entries(): + for episode in episodes: + if episode_path := episode.get('url'): + yield self._extract_episode_info(urljoin(url, episode_path), episode) html = self._download_webpage(url, program_id) - program_title = clean_html(get_element_by_class('p-programDetail__title', html)) - program_description = clean_html(get_element_by_class('p-programDetail__text', html)) + program_title = self._extract_meta_from_class_elements([ + 'p-programDetail__title', # /ondemand/program/ + 'pProgramHero__logoText', # /shows/ + 'tAudioProgramMain__title', # /shows/audio/programs/ + 'p-program-name'], html) # /tv/ 
+ program_description = self._extract_meta_from_class_elements([ + 'p-programDetail__text', # /ondemand/program/ + 'pProgramHero__description', # /shows/ + 'tAudioProgramMain__info', # /shows/audio/programs/ + 'p-program-description'], html) # /tv/ - return self.playlist_result(entries, program_id, program_title, program_description) + return self.playlist_result(entries(), program_id, program_title, program_description) class NhkForSchoolBangumiIE(InfoExtractor): From 2e94602f241f6e41bdc48576c61089435529339b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 7 Apr 2024 15:55:46 -0500 Subject: [PATCH 011/124] [ie/jiosaavn] Support playlists (#9622) Closes #9616 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jiosaavn.py | 181 ++++++++++++++++++++++---------- 2 files changed, 124 insertions(+), 58 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2ad5801c44..42034275b9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -876,6 +876,7 @@ from .jiosaavn import ( JioSaavnSongIE, JioSaavnAlbumIE, + JioSaavnPlaylistIE, ) from .jove import JoveIE from .joj import JojIE diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 1131ac0d47..d7f0a2dba8 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,30 +1,90 @@ +import functools + from .common import InfoExtractor from ..utils import ( + format_field, int_or_none, js_to_json, - orderedSet, + make_archive_id, + smuggle_url, + unsmuggle_url, + url_basename, url_or_none, urlencode_postdata, - urljoin, ) from ..utils.traversal import traverse_obj class JioSaavnBaseIE(InfoExtractor): - def _extract_initial_data(self, url, audio_id): - webpage = self._download_webpage(url, audio_id) + _VALID_BITRATES = {'16', '32', '64', '128', '320'} + + @functools.cached_property + def requested_bitrates(self): + requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') + if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES: + raise ValueError( + f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. 
' + + f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') + return requested_bitrates + + def _extract_formats(self, song_data): + for bitrate in self.requested_bitrates: + media_data = self._download_json( + 'https://www.jiosaavn.com/api.php', song_data['id'], + f'Downloading format info for {bitrate}', + fatal=False, data=urlencode_postdata({ + '__call': 'song.generateAuthToken', + '_format': 'json', + 'bitrate': bitrate, + 'url': song_data['encrypted_media_url'], + })) + if not traverse_obj(media_data, ('auth_url', {url_or_none})): + self.report_warning(f'Unable to extract format info for {bitrate}') + continue + ext = media_data.get('type') + yield { + 'url': media_data['auth_url'], + 'ext': 'm4a' if ext == 'mp4' else ext, + 'format_id': bitrate, + 'abr': int(bitrate), + 'vcodec': 'none', + } + + def _extract_song(self, song_data): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': ('title', 'text', {str}), + 'album': ('album', 'text', {str}), + 'thumbnail': ('image', 0, {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'release_year': ('year', {int_or_none}), + 'artists': ('artists', lambda _, v: v['role'] == 'singer', 'name', {str}), + 'webpage_url': ('perma_url', {url_or_none}), # for song, playlist extraction + }) + if not info.get('webpage_url'): # for album extraction / fallback + info['webpage_url'] = format_field( + song_data, [('title', 'action')], 'https://www.jiosaavn.com%s') or None + if webpage_url := info['webpage_url']: + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, url_basename(webpage_url))] + + return info + + def _extract_initial_data(self, url, display_id): + webpage = self._download_webpage(url, display_id) return self._search_json( r'window\.__INITIAL_DATA__\s*=', webpage, - 'init json', audio_id, transform_source=js_to_json) + 'initial data', display_id, transform_source=js_to_json) class JioSaavnSongIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:song' _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', 'md5': '3b84396d15ed9e083c3106f1fa589c04', 'info_dict': { - 'id': 'OQsEfQFVUXk', + 'id': 'IcoLuefJ', 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', @@ -32,62 +92,34 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'duration': 205, 'view_count': int, 'release_year': 2018, - 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi', 'Rashmi Virag', 'Irshad Kamil'], + 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], + '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', 'only_matching': True, }] - _VALID_BITRATES = ('16', '32', '64', '128', '320') - def _real_extract(self, url): - audio_id = self._match_id(url) - extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') - if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]: - raise ValueError( - f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. 
' - + f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}') + url, smuggled_data = unsmuggle_url(url) + song_data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) - song_data = self._extract_initial_data(url, audio_id)['song']['song'] - formats = [] - for bitrate in extract_bitrates: - media_data = self._download_json( - 'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}', - fatal=False, data=urlencode_postdata({ - '__call': 'song.generateAuthToken', - '_format': 'json', - 'bitrate': bitrate, - 'url': song_data['encrypted_media_url'], - })) - if not media_data.get('auth_url'): - self.report_warning(f'Unable to extract format info for {bitrate}') - continue - ext = media_data.get('type') - formats.append({ - 'url': media_data['auth_url'], - 'ext': 'm4a' if ext == 'mp4' else ext, - 'format_id': bitrate, - 'abr': int(bitrate), - 'vcodec': 'none', - }) + if 'id' in song_data and 'encrypted_media_url' in song_data: + result = {'id': song_data['id']} + else: + # only extract metadata if this is not a url_transparent result + song_data = self._extract_initial_data(url, self._match_id(url))['song']['song'] + result = self._extract_song(song_data) - return { - 'id': audio_id, - 'formats': formats, - **traverse_obj(song_data, { - 'title': ('title', 'text'), - 'album': ('album', 'text'), - 'thumbnail': ('image', 0, {url_or_none}), - 'duration': ('duration', {int_or_none}), - 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), - 'artists': ('artists', ..., 'name', {str}, all, {orderedSet}), - }), - } + result['formats'] = list(self._extract_formats(song_data)) + return result class JioSaavnAlbumIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:album' _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', @@ -98,12 +130,45 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): 'playlist_count': 10, }] - def _real_extract(self, url): - album_id = self._match_id(url) - album_view = self._extract_initial_data(url, album_id)['albumView'] + def _entries(self, playlist_data): + for song_data in traverse_obj(playlist_data, ( + 'modules', lambda _, x: x['key'] == 'list', 'data', lambda _, v: v['title']['action'])): + song_info = self._extract_song(song_data) + # album song data is missing artists and release_year, need to re-extract metadata + yield self.url_result(song_info['webpage_url'], JioSaavnSongIE, **song_info) - return self.playlist_from_matches( - traverse_obj(album_view, ( - 'modules', lambda _, x: x['key'] == 'list', 'data', ..., 'title', 'action', {str})), - album_id, traverse_obj(album_view, ('album', 'title', 'text', {str})), ie=JioSaavnSongIE, - getter=lambda x: urljoin('https://www.jiosaavn.com/', x)) + def _real_extract(self, url): + display_id = self._match_id(url) + album_data = self._extract_initial_data(url, display_id)['albumView'] + + return self.playlist_result( + self._entries(album_data), display_id, traverse_obj(album_data, ('album', 'title', 'text', {str}))) + + +class JioSaavnPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:playlist' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', + 'info_dict': { + 'id': 'LlJ8ZWT1ibN5084vKHRj2Q__', + 'title': 'Mood English', + }, + 'playlist_mincount': 50, 
+ }] + + def _entries(self, playlist_data): + for song_data in traverse_obj(playlist_data, ('list', lambda _, v: v['perma_url'])): + song_info = self._extract_song(song_data) + url = smuggle_url(song_info['webpage_url'], { + 'id': song_data['id'], + 'encrypted_media_url': song_data['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + + def _real_extract(self, url): + display_id = self._match_id(url) + playlist_data = self._extract_initial_data(url, display_id)['playlist']['playlist'] + + return self.playlist_result( + self._entries(playlist_data), display_id, traverse_obj(playlist_data, ('title', 'text', {str}))) From df0e138fc02ae2764a44f2f59fc93c756c4d3ee2 Mon Sep 17 00:00:00 2001 From: Leo Heitmann Ruiz Date: Mon, 8 Apr 2024 21:18:04 +0200 Subject: [PATCH 012/124] [docs] Various manpage fixes Authored by: leoheitmannruiz --- README.md | 2 ++ devscripts/prepare_manpage.py | 27 +++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee1b599900..fde5453f82 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,7 @@ ## UPDATE You may also use `--update-to ` (`/`) to update to a channel on a completely different repository. Be careful with what repository you are updating to though, there is no verification done for binaries from different repositories. Example usage: + * `yt-dlp --update-to master` switch to the `master` channel and update to its latest release * `yt-dlp --update-to stable@2023.07.06` upgrade/downgrade to release to `stable` channel tag `2023.07.06` * `yt-dlp --update-to 2023.10.07` upgrade/downgrade to tag `2023.10.07` if it exists on the current channel @@ -1892,6 +1893,7 @@ ## Installing Plugins `.zip`, `.egg` and `.whl` archives containing a `yt_dlp_plugins` namespace folder in their root are also supported as plugin packages. + * e.g. `${XDG_CONFIG_HOME}/yt-dlp/plugins/mypluginpkg.zip` where `mypluginpkg.zip` contains `yt_dlp_plugins//myplugin.py` Run yt-dlp with `--verbose` to check if the plugin has been loaded. diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 009e7bba10..47188e9923 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -43,6 +43,27 @@ def filter_excluded_sections(readme): '', readme) +def _convert_code_blocks(readme): + current_code_block = None + + for line in readme.splitlines(True): + if current_code_block: + if line == current_code_block: + current_code_block = None + yield '\n' + else: + yield f' {line}' + elif line.startswith('```'): + current_code_block = line.count('`') * '`' + '\n' + yield '\n' + else: + yield line + + +def convert_code_blocks(readme): + return ''.join(_convert_code_blocks(readme)) + + def move_sections(readme): MOVE_TAG_TEMPLATE = '' sections = re.findall(r'(?m)^%s$' % ( @@ -65,8 +86,10 @@ def move_sections(readme): def filter_options(readme): section = re.search(r'(?sm)^# USAGE AND OPTIONS\n.+?(?=^# )', readme).group(0) + section_new = section.replace('*', R'\*') + options = '# OPTIONS\n' - for line in section.split('\n')[1:]: + for line in section_new.split('\n')[1:]: mobj = re.fullmatch(r'''(?x) \s{4}(?P-(?:,\s|[^\s])+) (?:\s(?P(?:[^\s]|\s(?!\s))+))? 
@@ -86,7 +109,7 @@ def filter_options(readme): return readme.replace(section, options, 1) -TRANSFORM = compose_functions(filter_excluded_sections, move_sections, filter_options) +TRANSFORM = compose_functions(filter_excluded_sections, convert_code_blocks, move_sections, filter_options) def main(): From 79a451e5763eda8b10d00684d5d3378f3255ee01 Mon Sep 17 00:00:00 2001 From: luiso1979 Date: Mon, 8 Apr 2024 21:53:30 +0200 Subject: [PATCH 013/124] [networking] Respect `SSLKEYLOGFILE` environment variable (#9543) Authored by: luiso1979 --- yt_dlp/networking/_helper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index d79dd79530..ecaff36e73 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -2,6 +2,7 @@ import contextlib import functools +import os import socket import ssl import sys @@ -121,6 +122,9 @@ def make_ssl_context( context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) context.check_hostname = verify context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE + # OpenSSL 1.1.1+ Python 3.8+ keylog file + if hasattr(context, 'keylog_filename'): + context.keylog_filename = os.environ.get('SSLKEYLOGFILE') # Some servers may reject requests if ALPN extension is not sent. See: # https://github.com/python/cpython/issues/85140 From 9590cc6b4768e190183d7d071a6c78170889116a Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 8 Apr 2024 22:47:38 +0200 Subject: [PATCH 014/124] Add new option `--progress-delta` (#9082) Authored by: Grub4K --- README.md | 1 + yt_dlp/YoutubeDL.py | 2 +- yt_dlp/__init__.py | 1 + yt_dlp/downloader/common.py | 11 +++++++++++ yt_dlp/options.py | 4 ++++ 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fde5453f82..bc4eba6606 100644 --- a/README.md +++ b/README.md @@ -758,6 +758,7 @@ ## Verbosity and Simulation Options: accessible under "progress" key. E.g. --console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s" + --progress-delta SECONDS Time between progress output (default: 0) -v, --verbose Print various debugging information --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 291fc8d00c..35aba968fb 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -481,7 +481,7 @@ class YoutubeDL: nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size, - external_downloader_args, concurrent_fragment_downloads. + external_downloader_args, concurrent_fragment_downloads, progress_delta. 
The following options are used by the post processors: ffmpeg_location: Location of the ffmpeg/avconv binary; either the path diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 940594fafb..3d606bcba2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -836,6 +836,7 @@ def parse_options(argv=None): 'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress, 'progress_with_newline': opts.progress_with_newline, 'progress_template': opts.progress_template, + 'progress_delta': opts.progress_delta, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index b71d7ee8f2..65a0d6f234 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -4,6 +4,7 @@ import os import random import re +import threading import time from ..minicurses import ( @@ -63,6 +64,7 @@ class FileDownloader: min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. + progress_delta: The minimum time between progress output, in seconds external_downloader_args: A dictionary of downloader keys (in lower case) and a list of additional command-line arguments for the executable. Use 'default' as the name for arguments to be @@ -88,6 +90,9 @@ def __init__(self, ydl, params): self.params = params self._prepare_multiline_status() self.add_progress_hook(self.report_progress) + if self.params.get('progress_delta'): + self._progress_delta_lock = threading.Lock() + self._progress_delta_time = time.monotonic() def _set_ydl(self, ydl): self.ydl = ydl @@ -366,6 +371,12 @@ def with_fields(*tups, default=''): if s['status'] != 'downloading': return + if update_delta := self.params.get('progress_delta'): + with self._progress_delta_lock: + if time.monotonic() < self._progress_delta_time: + return + self._progress_delta_time += update_delta + s.update({ '_eta_str': self.format_eta(s.get('eta')).strip(), '_speed_str': self.format_speed(s.get('speed')), diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 43d71ef070..faa1ee5634 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1258,6 +1258,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'the progress attributes are accessible under "progress" key. E.g. 
' # TODO: Document the fields inside "progress" '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"')) + verbosity.add_option( + '--progress-delta', + metavar='SECONDS', action='store', dest='progress_delta', type=float, default=0, + help='Time between progress output (default: 0)') verbosity.add_option( '-v', '--verbose', action='store_true', dest='verbose', default=False, From b19ae095fdddd43c2a2c67d10fbe0d9a645bb98f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 8 Apr 2024 18:20:58 -0500 Subject: [PATCH 015/124] [build] Do not include `curl_cffi` in `macos_legacy` (#9653) Authored by: bashonly --- .github/workflows/build.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5285923e71..04536e22c3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -320,7 +320,7 @@ jobs: run: | brew install coreutils python3 devscripts/install_deps.py --user -o --include build - python3 devscripts/install_deps.py --user --include pyinstaller --include curl_cffi + python3 devscripts/install_deps.py --user --include pyinstaller - name: Prepare run: | diff --git a/README.md b/README.md index bc4eba6606..458541d68e 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ #### Impersonation * [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) * Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]` - * Only included in `yt-dlp.exe`, `yt-dlp_macos` and `yt-dlp_macos_legacy` builds + * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds ### Metadata From 216f6a3cb57824e6a3c859649ce058c199b1b247 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 9 Apr 2024 11:12:26 -0500 Subject: [PATCH 016/124] [cleanup] Misc (#9426) Authored by: bashonly, pukkandan --- .github/workflows/quick-test.yml | 2 ++ Makefile | 7 +++-- devscripts/changelog_override.json | 12 ++++++++ test/test_traversal.py | 44 ++++++++++++++++-------------- yt_dlp/networking/_helper.py | 2 +- yt_dlp/update.py | 2 +- 6 files changed, 44 insertions(+), 25 deletions(-) diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 3114e7bdd6..24b34911f3 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -27,6 +27,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + with: + python-version: '3.8' - name: Install flake8 run: python3 ./devscripts/install_deps.py -o --include dev - name: Make lazy extractors diff --git a/Makefile b/Makefile index 38c6b4f2dc..cef4bc6cb1 100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,12 @@ tar: yt-dlp.tar.gz # intended use: when building a source distribution, # make pypi-files && python3 -m build -sn . 
pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \ - completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/* + completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/* -.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest supportedsites +.PHONY: all clean clean-all clean-test clean-dist clean-cache \ + completions completion-bash completion-fish completion-zsh \ + doc issuetemplates supportedsites ot offlinetest codetest test \ + tar pypi-files lazy-extractors install uninstall clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index eaa348cf2e..52ddf0613d 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -127,8 +127,20 @@ "short": "[ie] Support multi-period MPD streams (#6654)", "authors": ["alard", "pukkandan"] }, + { + "action": "change", + "when": "aa7e9ae4f48276bd5d0173966c77db9484f65a0a", + "short": "[ie/xvideos] Support new URL format (#9502)", + "authors": ["sta1us"] + }, { "action": "remove", "when": "22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80" + }, + { + "action": "change", + "when": "e3a3ed8a981d9395c4859b6ef56cd02bc3148db2", + "short": "[cleanup:ie] No `from` stdlib imports in extractors", + "authors": ["pukkandan"] } ] diff --git a/test/test_traversal.py b/test/test_traversal.py index ed29d03ad5..9b2a27b080 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -26,27 +26,6 @@ class TestTraversal: - def test_dict_get(self): - FALSE_VALUES = { - 'none': None, - 'false': False, - 'zero': 0, - 'empty_string': '', - 'empty_list': [], - } - d = {**FALSE_VALUES, 'a': 42} - assert dict_get(d, 'a') == 42 - assert dict_get(d, 'b') is None - assert dict_get(d, 'b', 42) == 42 - assert dict_get(d, ('a',)) == 42 - assert dict_get(d, ('b', 'a')) == 42 - assert dict_get(d, ('b', 'c', 'a', 'd')) == 42 - assert dict_get(d, ('b', 'c')) is None - assert dict_get(d, ('b', 'c'), 42) == 42 - for key, false_value in FALSE_VALUES.items(): - assert dict_get(d, ('b', 'c', key)) is None - assert dict_get(d, ('b', 'c', key), skip_false_values=False) == false_value - def test_traversal_base(self): assert traverse_obj(_TEST_DATA, ('str',)) == 'str', \ 'allow tuple path' @@ -440,3 +419,26 @@ def test_traversal_morsel(self): 'function key should yield all values' assert traverse_obj(morsel, [(None,), any]) == morsel, \ 'Morsel should not be implicitly changed to dict on usage' + + +class TestDictGet: + def test_dict_get(self): + FALSE_VALUES = { + 'none': None, + 'false': False, + 'zero': 0, + 'empty_string': '', + 'empty_list': [], + } + d = {**FALSE_VALUES, 'a': 42} + assert dict_get(d, 'a') == 42 + assert dict_get(d, 'b') is None + assert dict_get(d, 'b', 42) == 42 + assert dict_get(d, ('a',)) == 42 + assert dict_get(d, ('b', 'a')) == 42 + assert dict_get(d, ('b', 'c', 'a', 'd')) == 42 + assert dict_get(d, ('b', 'c')) is None + assert dict_get(d, ('b', 'c'), 42) == 42 + for key, false_value in FALSE_VALUES.items(): + assert dict_get(d, ('b', 'c', key)) is None + assert dict_get(d, ('b', 'c', key), skip_false_values=False) == false_value diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index ecaff36e73..8e678b26ab 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -124,7 +124,7 @@ def make_ssl_context( context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE # OpenSSL 
1.1.1+, Python 3.8+: keylog file
+    if hasattr(context, 'keylog_filename'):
+        context.keylog_filename = os.environ.get('SSLKEYLOGFILE')
 
     # Some servers may reject requests if ALPN extension is not sent. See:
     # https://github.com/python/cpython/issues/85140

From 9590cc6b4768e190183d7d071a6c78170889116a Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Mon, 8 Apr 2024 22:47:38 +0200
Subject: [PATCH 014/124] Add new option `--progress-delta` (#9082)

Authored by: Grub4K
---
 README.md                   |  1 +
 yt_dlp/YoutubeDL.py         |  2 +-
 yt_dlp/__init__.py          |  1 +
 yt_dlp/downloader/common.py | 11 +++++++++++
 yt_dlp/options.py           |  4 ++++
 5 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fde5453f82..bc4eba6606 100644
--- a/README.md
+++ b/README.md
@@ -758,6 +758,7 @@ ## Verbosity and Simulation Options:
                                     accessible under "progress" key. E.g.
                                     --console-title --progress-template
                                     "download-title:%(info.id)s-%(progress.eta)s"
+    --progress-delta SECONDS        Time between progress output (default: 0)
     -v, --verbose                   Print various debugging information
     --dump-pages                    Print downloaded pages encoded using base64
                                     to debug problems (very verbose)
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 291fc8d00c..35aba968fb 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -481,7 +481,7 @@ class YoutubeDL:
     nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
     max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
     continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
-    external_downloader_args, concurrent_fragment_downloads.
+    external_downloader_args, concurrent_fragment_downloads, progress_delta.
} ] diff --git a/test/test_utils.py b/test/test_utils.py index 71febeefd6..ddf0a7c242 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2069,6 +2069,10 @@ def run_shell(args): # Test escaping assert run_shell(['echo', 'test"&']) == '"test""&"\n' + assert run_shell(['echo', '%CMDCMDLINE:~-1%&']) == '"%CMDCMDLINE:~-1%&"\n' + assert run_shell(['echo', 'a\nb']) == '"a"\n"b"\n' + assert run_shell(['echo', '"']) == '""""\n' + assert run_shell(['echo', '\\']) == '\\\n' # Test if delayed expansion is disabled assert run_shell(['echo', '^!']) == '"^!"\n' assert run_shell('echo "^!"') == '"^!"\n' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 35aba968fb..9f730d0384 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -25,7 +25,7 @@ from .cache import Cache from .compat import functools, urllib # isort: split -from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req +from .compat import compat_os_name, urllib_req_to_req from .cookies import LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version @@ -102,7 +102,6 @@ UserNotLive, YoutubeDLError, age_restricted, - args_to_str, bug_reports_message, date_from_str, deprecation_warning, @@ -141,6 +140,7 @@ sanitize_filename, sanitize_path, sanitize_url, + shell_quote, str_or_none, strftime_or_none, subtitles_filename, @@ -823,7 +823,7 @@ def warn_if_short_id(self, argv): self.report_warning( 'Long argument string detected. ' 'Use -- to separate parameters and URLs, like this:\n%s' % - args_to_str(correct_argv)) + shell_quote(correct_argv)) def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" @@ -1355,7 +1355,7 @@ def create_key(outer_mobj): value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted value = map(str, variadic(value) if '#' in flags else [value]) - value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt + value, fmt = shell_quote(value, shell=True), str_fmt elif fmt[-1] == 'B': # bytes value = f'%{str_fmt}'.encode() % str(value).encode() value, fmt = value.decode('utf-8', 'ignore'), 's' diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index 5ad5c70ecf..d820adaf1e 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -27,12 +27,9 @@ def compat_etree_fromstring(text): compat_os_name = os._name if os.name == 'java' else os.name -if compat_os_name == 'nt': - def compat_shlex_quote(s): - import re - return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""') -else: - from shlex import quote as compat_shlex_quote # noqa: F401 +def compat_shlex_quote(s): + from ..utils import shell_quote + return shell_quote(s) def compat_ord(c): diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index dec514674f..e3e80f3d33 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -50,7 +50,6 @@ compat_expanduser, compat_HTMLParseError, compat_os_name, - compat_shlex_quote, ) from ..dependencies import xattr @@ -836,9 +835,11 @@ def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs if shell and compat_os_name == 'nt' and kwargs.get('executable') is None: if not isinstance(args, str): - args = ' '.join(compat_shlex_quote(a) for a in args) + args = shell_quote(args, shell=True) shell = False - args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"' + # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`) + env['='] = '"^\n\n"' + args = 
f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"' super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo) @@ -1637,15 +1638,38 @@ def get_filesystem_encoding(): return encoding if encoding is not None else 'utf-8' -def shell_quote(args): - quoted_args = [] - encoding = get_filesystem_encoding() - for a in args: - if isinstance(a, bytes): - # We may get a filename encoded with 'encodeFilename' - a = a.decode(encoding) - quoted_args.append(compat_shlex_quote(a)) - return ' '.join(quoted_args) +_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'}) +_CMD_QUOTE_TRANS = str.maketrans({ + # Keep quotes balanced by replacing them with `""` instead of `\\"` + '"': '""', + # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`) + # `=` should be unique since variables containing `=` cannot be set using cmd + '\n': '%=%', + # While we are only required to escape backslashes immediately before quotes, + # we instead escape all of 'em anyways to be consistent + '\\': '\\\\', + # Use zero length variable replacement so `%` doesn't get expanded + # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`) + '%': '%%cd:~,%', +}) + + +def shell_quote(args, *, shell=False): + args = list(variadic(args)) + if any(isinstance(item, bytes) for item in args): + deprecation_warning('Passing bytes to utils.shell_quote is deprecated') + encoding = get_filesystem_encoding() + for index, item in enumerate(args): + if isinstance(item, bytes): + args[index] = item.decode(encoding) + + if compat_os_name != 'nt': + return shlex.join(args) + + trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS + return ' '.join( + s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""') + for s in args) def smuggle_url(url, data): @@ -2849,7 +2873,7 @@ def ytdl_is_updateable(): def args_to_str(args): # Get a short string representation for a subprocess command - return ' '.join(compat_shlex_quote(a) for a in args) + return shell_quote(args) def error_to_str(err): From 168e72dcd3e04e0e19e92c012a04b8a1e4658f50 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:03:28 +0000 Subject: [PATCH 018/124] Release 2024.04.09 Created by: Grub4K :ci skip all :ci run dl --- CONTRIBUTORS | 10 +++++ Changelog.md | 95 +++++++++++++++++++++++++++++++++++++++++++++++ supportedsites.md | 13 +++++-- yt_dlp/version.py | 6 +-- 4 files changed, 118 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6ee3baa3d0..8b5d19a64f 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -600,3 +600,13 @@ xpadev-net Xpl0itU YoshichikaAAA zhijinwuu +alb +hruzgar +kasper93 +leoheitmannruiz +luiso1979 +nipotan +Offert4324 +sta1us +Tomoka1 +trwstin diff --git a/Changelog.md b/Changelog.md index 45a9cef3fa..6cf08beab4 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,101 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.04.09 + +#### Important changes +- Security: [[CVE-2024-22423](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-22423)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-hjq6-52gw-2g7p) + - The shell escape function now properly escapes `%`, `\` and `\n`. + - `utils.Popen` has been patched accordingly. 
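+  - A minimal illustration of the fixed class of issue (hypothetical video title, for demonstration only): running `yt-dlp --exec "echo %(title)q" URL` against a video titled `x" & calc.exe & "` could previously let `cmd.exe` execute the injected command; the title now expands to a single, safely quoted argument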
+ +#### Core changes +- [Add new option `--progress-delta`](https://github.com/yt-dlp/yt-dlp/commit/9590cc6b4768e190183d7d071a6c78170889116a) ([#9082](https://github.com/yt-dlp/yt-dlp/issues/9082)) by [Grub4K](https://github.com/Grub4K) +- [Add new options `--impersonate` and `--list-impersonate-targets`](https://github.com/yt-dlp/yt-dlp/commit/0b81d4d252bd065ccd352722987ea34fe17f9244) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- [Add option `--no-break-on-existing`](https://github.com/yt-dlp/yt-dlp/commit/16be117729150b2784f3b17755c886cb0cf73374) ([#9610](https://github.com/yt-dlp/yt-dlp/issues/9610)) by [bashonly](https://github.com/bashonly) +- [Fix `filesize_approx` calculation](https://github.com/yt-dlp/yt-dlp/commit/86e3b82261e8ebc6c6707c09544c9dfb8907c0fd) ([#9560](https://github.com/yt-dlp/yt-dlp/issues/9560)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- [Infer `acodec` for single-codec containers](https://github.com/yt-dlp/yt-dlp/commit/86a972033e05fea80e5fe7f2aff6723dbe2f3952) by [pukkandan](https://github.com/pukkandan) +- [Prevent RCE when using `--exec` with `%q` (CVE-2024-22423)](https://github.com/yt-dlp/yt-dlp/commit/ff07792676f404ffff6ee61b5638c9dc1a33a37a) by [Grub4K](https://github.com/Grub4K) +- **cookies**: [Add `--cookies-from-browser` support for Firefox Flatpak](https://github.com/yt-dlp/yt-dlp/commit/2ab2651a4a7be18939e2b4cb21be79fe477c797a) ([#9619](https://github.com/yt-dlp/yt-dlp/issues/9619)) by [un-def](https://github.com/un-def) +- **utils** + - `traverse_obj` + - [Allow unbranching using `all` and `any`](https://github.com/yt-dlp/yt-dlp/commit/3699eeb67cad333272b14a42dd3843d93fda1a2e) ([#9571](https://github.com/yt-dlp/yt-dlp/issues/9571)) by [Grub4K](https://github.com/Grub4K) + - [Convenience improvements](https://github.com/yt-dlp/yt-dlp/commit/32abfb00bdbd119ca675fdc6d1719331f0a2741a) ([#9577](https://github.com/yt-dlp/yt-dlp/issues/9577)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Add extractor impersonate API](https://github.com/yt-dlp/yt-dlp/commit/50c29352312f5662acf9a64b0012766f5c40af61) ([#9474](https://github.com/yt-dlp/yt-dlp/issues/9474)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- **afreecatv** + - [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/9415f1a5ef88482ebafe3083e8bcb778ac512df7) ([#9566](https://github.com/yt-dlp/yt-dlp/issues/9566)) by [bashonly](https://github.com/bashonly), [Tomoka1](https://github.com/Tomoka1) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9073ae6458f4c6a832aa832c67174c61852869be) ([#9348](https://github.com/yt-dlp/yt-dlp/issues/9348)) by [hui1601](https://github.com/hui1601) +- **asobistage**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0284f1fee202302a78888420f933deae19d9f4e1) ([#8735](https://github.com/yt-dlp/yt-dlp/issues/8735)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **box**: [Support URLs without file IDs](https://github.com/yt-dlp/yt-dlp/commit/07f5b2f7570fd9ac85aed17f4c0118f6eac77beb) ([#9504](https://github.com/yt-dlp/yt-dlp/issues/9504)) by [shreyasminocha](https://github.com/shreyasminocha) +- **cbc.ca**: player: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/b49d5ffc53a72d8245ba319ff07bdc5b8c6a4f0c) ([#9561](https://github.com/yt-dlp/yt-dlp/issues/9561)) by 
[trainman261](https://github.com/trainman261) +- **crunchyroll** + - [Extract `vo_adaptive_hls` formats by default](https://github.com/yt-dlp/yt-dlp/commit/be77923ffe842f667971019460f6005f3cad01eb) ([#9447](https://github.com/yt-dlp/yt-dlp/issues/9447)) by [bashonly](https://github.com/bashonly) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/954e57e405f79188450eb30103a9308732cd318f) ([#9615](https://github.com/yt-dlp/yt-dlp/issues/9615)) by [bytedream](https://github.com/bytedream) +- **dropbox**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/a48cc86d6f6b20427553620c2ddb990ede6a4b41) ([#9627](https://github.com/yt-dlp/yt-dlp/issues/9627)) by [bashonly](https://github.com/bashonly) +- **fathom**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bc2b8c0596fd6b75af24822c4f0f1da6783d71f7) ([#9495](https://github.com/yt-dlp/yt-dlp/issues/9495)) by [src-tinkerer](https://github.com/src-tinkerer) +- **gofile**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0da66980d3193cad3dae0120cddddbfcabddf7a1) ([#9446](https://github.com/yt-dlp/yt-dlp/issues/9446)) by [jazz1611](https://github.com/jazz1611) +- **imgur**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/86d2f4d24849af0d1f3af7c0e2ac43bf8a058f74) ([#9471](https://github.com/yt-dlp/yt-dlp/issues/9471)) by [trwstin](https://github.com/trwstin) +- **jiosaavn** + - [Extract artists](https://github.com/yt-dlp/yt-dlp/commit/0ae16ceb1846cc4e609b70ce7c5d8e7458efceb2) ([#9612](https://github.com/yt-dlp/yt-dlp/issues/9612)) by [bashonly](https://github.com/bashonly) + - [Fix format extensions](https://github.com/yt-dlp/yt-dlp/commit/443e206ec41e64ca2aef61d8ef91640fb69b3113) ([#9609](https://github.com/yt-dlp/yt-dlp/issues/9609)) by [bashonly](https://github.com/bashonly) + - [Support playlists](https://github.com/yt-dlp/yt-dlp/commit/2e94602f241f6e41bdc48576c61089435529339b) ([#9622](https://github.com/yt-dlp/yt-dlp/issues/9622)) by [bashonly](https://github.com/bashonly) +- **joqrag**: [Fix live status detection](https://github.com/yt-dlp/yt-dlp/commit/f2fd449b46c4058222e1744f7a35caa20b2d003d) ([#9624](https://github.com/yt-dlp/yt-dlp/issues/9624)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **kick**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/c8a61a910096c77ce08dad5e1b2fbda5eb964156) ([#9611](https://github.com/yt-dlp/yt-dlp/issues/9611)) by [bashonly](https://github.com/bashonly) +- **loom**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/f859ed3ba1e8b129ae6a467592c65687e73fbca1) ([#8686](https://github.com/yt-dlp/yt-dlp/issues/8686)) by [bashonly](https://github.com/bashonly), [hruzgar](https://github.com/hruzgar) +- **medici**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4cd9e251b9abada107b10830de997bf4d79ca369) ([#9518](https://github.com/yt-dlp/yt-dlp/issues/9518)) by [Offert4324](https://github.com/Offert4324) +- **mixch** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4c3b7a0769706f7f0ea24adf1f219d5ae82d2b07) ([#9608](https://github.com/yt-dlp/yt-dlp/issues/9608)) by [bashonly](https://github.com/bashonly), [nipotan](https://github.com/nipotan) + - archive: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c59de48e2bb4c681b03b93b584a05f52609ce4a0) ([#8761](https://github.com/yt-dlp/yt-dlp/issues/8761)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **nhk**: [Fix NHK World extractors](https://github.com/yt-dlp/yt-dlp/commit/4af9d5c2f6aa81403ae2a8a5ae3cc824730f0b86) 
([#9623](https://github.com/yt-dlp/yt-dlp/issues/9623)) by [bashonly](https://github.com/bashonly) +- **patreon**: [Do not extract dead embed URLs](https://github.com/yt-dlp/yt-dlp/commit/36b240f9a72af57eb2c9d927ebb7fd1c917ebf18) ([#9613](https://github.com/yt-dlp/yt-dlp/issues/9613)) by [johnvictorfs](https://github.com/johnvictorfs) +- **radio1be**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36baaa10e06715ccba06b78885b2042c4844c826) ([#9122](https://github.com/yt-dlp/yt-dlp/issues/9122)) by [HobbyistDev](https://github.com/HobbyistDev) +- **sharepoint**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ff349ff94aae0b2b148bd3670f7c91d39c2f1d8e) ([#6531](https://github.com/yt-dlp/yt-dlp/issues/6531)) by [bashonly](https://github.com/bashonly), [C0D3D3V](https://github.com/C0D3D3V) +- **sonylivseries**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/f2868b26e917354203f82a370ad2396646edb813) ([#9423](https://github.com/yt-dlp/yt-dlp/issues/9423)) by [bashonly](https://github.com/bashonly) +- **soundcloud** + - [Adjust format sorting](https://github.com/yt-dlp/yt-dlp/commit/a2d0840739cddd585d24e0ce4796394fc8a4fa2e) ([#9584](https://github.com/yt-dlp/yt-dlp/issues/9584)) by [bashonly](https://github.com/bashonly) + - [Support cookies](https://github.com/yt-dlp/yt-dlp/commit/97362712a1f2b04e735bdf54f749ad99165a62fe) ([#9586](https://github.com/yt-dlp/yt-dlp/issues/9586)) by [bashonly](https://github.com/bashonly) + - [Support retries for API rate-limit](https://github.com/yt-dlp/yt-dlp/commit/246571ae1d867df8bf31a056bdf3bbbfd398366a) ([#9585](https://github.com/yt-dlp/yt-dlp/issues/9585)) by [bashonly](https://github.com/bashonly) +- **thisoldhouse**: [Support Brightcove embeds](https://github.com/yt-dlp/yt-dlp/commit/0df63cce69026d2f4c0cbb4dd36163e83eac93dc) ([#9576](https://github.com/yt-dlp/yt-dlp/issues/9576)) by [bashonly](https://github.com/bashonly) +- **tiktok** + - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/cb61e20c266facabb7a30f9ce53bd79dfc158475) ([#9548](https://github.com/yt-dlp/yt-dlp/issues/9548)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Prefer non-bytevc2 formats](https://github.com/yt-dlp/yt-dlp/commit/63f685f341f35f6f02b0368d1ba53bdb5b520410) ([#9575](https://github.com/yt-dlp/yt-dlp/issues/9575)) by [bashonly](https://github.com/bashonly) + - [Restore `carrier_region` API parameter](https://github.com/yt-dlp/yt-dlp/commit/fc53ec13ff1ee926a3e533a68cfca8acc887b661) ([#9637](https://github.com/yt-dlp/yt-dlp/issues/9637)) by [bashonly](https://github.com/bashonly) + - [Update API hostname](https://github.com/yt-dlp/yt-dlp/commit/8c05b3ebae23c5b444857549a85b84004c01a536) ([#9444](https://github.com/yt-dlp/yt-dlp/issues/9444)) by [bashonly](https://github.com/bashonly) +- **twitch**: [Extract AV1 and HEVC formats](https://github.com/yt-dlp/yt-dlp/commit/02f93ff51b3ff9436d60c4993562b366eaae8851) ([#9158](https://github.com/yt-dlp/yt-dlp/issues/9158)) by [kasper93](https://github.com/kasper93) +- **vkplay**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/b15b0c1d2106437ec61a5c436c543e8760eac160) ([#9636](https://github.com/yt-dlp/yt-dlp/issues/9636)) by [bashonly](https://github.com/bashonly) +- **xvideos**: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/aa7e9ae4f48276bd5d0173966c77db9484f65a0a) ([#9502](https://github.com/yt-dlp/yt-dlp/issues/9502)) by [sta1us](https://github.com/sta1us) +- **youtube** + - [Calculate more accurate 
`filesize`](https://github.com/yt-dlp/yt-dlp/commit/a25a424323267e3f6f9f63c0b62df499bd7b8d46) by [pukkandan](https://github.com/pukkandan) + - [Update `android` params](https://github.com/yt-dlp/yt-dlp/commit/e7b17fce14775bd2448695c8eb7379b8d31d3537) by [pukkandan](https://github.com/pukkandan) + - search: [Fix params for uncensored results](https://github.com/yt-dlp/yt-dlp/commit/17d248a58781e2588d18a5ebe00c441d10011fcd) ([#9456](https://github.com/yt-dlp/yt-dlp/issues/9456)) by [alb](https://github.com/alb), [pukkandan](https://github.com/pukkandan) + +#### Downloader changes +- **ffmpeg**: [Accept output args from info dict](https://github.com/yt-dlp/yt-dlp/commit/9c42b7eef547e826e9fcc7beb6706a2523949d05) ([#9278](https://github.com/yt-dlp/yt-dlp/issues/9278)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- [Respect `SSLKEYLOGFILE` environment variable](https://github.com/yt-dlp/yt-dlp/commit/79a451e5763eda8b10d00684d5d3378f3255ee01) ([#9543](https://github.com/yt-dlp/yt-dlp/issues/9543)) by [luiso1979](https://github.com/luiso1979) +- **Request Handler** + - curlcffi: [Add support for `curl_cffi`](https://github.com/yt-dlp/yt-dlp/commit/52f5be1f1e0dc45bb397ab950f564721976a39bf) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - websockets: [Workaround race condition causing issues on PyPy](https://github.com/yt-dlp/yt-dlp/commit/e5d4f11104ce7ea1717a90eea82c0f7d230ea5d5) ([#9514](https://github.com/yt-dlp/yt-dlp/issues/9514)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Do not include `curl_cffi` in `macos_legacy`](https://github.com/yt-dlp/yt-dlp/commit/b19ae095fdddd43c2a2c67d10fbe0d9a645bb98f) ([#9653](https://github.com/yt-dlp/yt-dlp/issues/9653)) by [bashonly](https://github.com/bashonly) + - [Optional dependencies cleanup](https://github.com/yt-dlp/yt-dlp/commit/58dd0f8d1eee6bc9fdc57f1923bed772fa3c946d) ([#9550](https://github.com/yt-dlp/yt-dlp/issues/9550)) by [bashonly](https://github.com/bashonly) + - [Print SHA sums to GHA logs](https://github.com/yt-dlp/yt-dlp/commit/e8032503b9517465b0e86d776fc1e60d8795d673) ([#9582](https://github.com/yt-dlp/yt-dlp/issues/9582)) by [bashonly](https://github.com/bashonly) + - [Update changelog for tarball and sdist](https://github.com/yt-dlp/yt-dlp/commit/17b96974a334688f76b57d350e07cae8cda46877) ([#9425](https://github.com/yt-dlp/yt-dlp/issues/9425)) by [bashonly](https://github.com/bashonly) +- **cleanup** + - [Standardize `import datetime as dt`](https://github.com/yt-dlp/yt-dlp/commit/c305a25c1b16bcf7a5ec499c3b786ed1e2c748da) ([#8978](https://github.com/yt-dlp/yt-dlp/issues/8978)) by [pukkandan](https://github.com/pukkandan) + - ie: [No `from` stdlib imports in extractors](https://github.com/yt-dlp/yt-dlp/commit/e3a3ed8a981d9395c4859b6ef56cd02bc3148db2) by [pukkandan](https://github.com/pukkandan) + - Miscellaneous: [216f6a3](https://github.com/yt-dlp/yt-dlp/commit/216f6a3cb57824e6a3c859649ce058c199b1b247) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- **docs** + - [Update yt-dlp tagline](https://github.com/yt-dlp/yt-dlp/commit/388c979ac63a8774339fac2516fe1cc852b4276e) ([#9481](https://github.com/yt-dlp/yt-dlp/issues/9481)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan), 
[seproDev](https://github.com/seproDev) + - [Various manpage fixes](https://github.com/yt-dlp/yt-dlp/commit/df0e138fc02ae2764a44f2f59fc93c756c4d3ee2) by [leoheitmannruiz](https://github.com/leoheitmannruiz) +- **test** + - [Workaround websocket server hanging](https://github.com/yt-dlp/yt-dlp/commit/f849d77ab54788446b995d256e1ee0894c4fb927) ([#9467](https://github.com/yt-dlp/yt-dlp/issues/9467)) by [coletdjnz](https://github.com/coletdjnz) + - `traversal`: [Separate traversal tests](https://github.com/yt-dlp/yt-dlp/commit/979ce2e786f2ee3fc783b6dc1ef4188d8805c923) ([#9574](https://github.com/yt-dlp/yt-dlp/issues/9574)) by [Grub4K](https://github.com/Grub4K) + ### 2024.03.10 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index a4b2d57998..ba77c0feb0 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -47,7 +47,7 @@ # Supported sites - **aenetworks:show** - **AeonCo** - **afreecatv**: [*afreecatv*](## "netrc machine") afreecatv.com - - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com + - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com livestreams - **afreecatv:user** - **AirTV** - **AitubeKZVideo** @@ -105,6 +105,7 @@ # Supported sites - **ArteTVPlaylist** - **asobichannel**: ASOBI CHANNEL - **asobichannel:tag**: ASOBI CHANNEL + - **AsobiStage**: ASOBISTAGE (アソビステージ) - **AtresPlayer**: [*atresplayer*](## "netrc machine") - **AtScaleConfEvent** - **ATVAt** @@ -436,6 +437,7 @@ # Supported sites - **FacebookPluginsVideo** - **fancode:live**: [*fancode*](## "netrc machine") (**Currently broken**) - **fancode:vod**: [*fancode*](## "netrc machine") (**Currently broken**) + - **Fathom** - **faz.net** - **fc2**: [*fc2*](## "netrc machine") - **fc2:embed** @@ -633,8 +635,9 @@ # Supported sites - **Jamendo** - **JamendoAlbum** - **JeuxVideo**: (**Currently broken**) - - **JioSaavnAlbum** - - **JioSaavnSong** + - **jiosaavn:album** + - **jiosaavn:playlist** + - **jiosaavn:song** - **Joj** - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. 
(JOQR) - **Jove** @@ -716,6 +719,8 @@ # Supported sites - **Lnk** - **LnkGo** - **loc**: Library of Congress + - **loom** + - **loom:folder** - **LoveHomePorn** - **LRTStream** - **LRTVOD** @@ -1136,6 +1141,7 @@ # Supported sites - **Radiko** - **RadikoRadio** - **radio.de**: (**Currently broken**) + - **Radio1Be** - **radiocanada** - **radiocanada:audiovideo** - **RadioComercial** @@ -1288,6 +1294,7 @@ # Supported sites - **SeznamZpravyArticle** - **Shahid**: [*shahid*](## "netrc machine") - **ShahidShow** + - **SharePoint** - **ShareVideosEmbed** - **ShemarooMe** - **ShowRoomLive** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 68c3f00e84..22c2c048d8 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.03.10' +__version__ = '2024.04.09' -RELEASE_GIT_HEAD = '615a84447e8322720be77a0e64298d7f42848693' +RELEASE_GIT_HEAD = 'ff07792676f404ffff6ee61b5638c9dc1a33a37a' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.03.10' +_pkg_version = '2024.04.09' From 0c21c53885cf03f4040467ae8c44d7ff51016116 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:08:25 -0500 Subject: [PATCH 019/124] [ie/jiosaavn] Extract via API and fix playlists (#9656) Closes #9648 Authored by: bashonly --- yt_dlp/extractor/jiosaavn.py | 106 +++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index d7f0a2dba8..35fb3fd6b1 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,10 +1,12 @@ import functools +import math +import re from .common import InfoExtractor from ..utils import ( - format_field, + InAdvancePagedList, + clean_html, int_or_none, - js_to_json, make_archive_id, smuggle_url, unsmuggle_url, @@ -16,6 +18,7 @@ class JioSaavnBaseIE(InfoExtractor): + _API_URL = 'https://www.jiosaavn.com/api.php' _VALID_BITRATES = {'16', '32', '64', '128', '320'} @functools.cached_property @@ -30,7 +33,7 @@ def requested_bitrates(self): def _extract_formats(self, song_data): for bitrate in self.requested_bitrates: media_data = self._download_json( - 'https://www.jiosaavn.com/api.php', song_data['id'], + self._API_URL, song_data['id'], f'Downloading format info for {bitrate}', fatal=False, data=urlencode_postdata({ '__call': 'song.generateAuthToken', @@ -50,31 +53,45 @@ def _extract_formats(self, song_data): 'vcodec': 'none', } - def _extract_song(self, song_data): + def _extract_song(self, song_data, url=None): info = traverse_obj(song_data, { 'id': ('id', {str}), - 'title': ('title', 'text', {str}), - 'album': ('album', 'text', {str}), - 'thumbnail': ('image', 0, {url_or_none}), + 'title': ('song', {clean_html}), + 'album': ('album', {clean_html}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), 'duration': ('duration', {int_or_none}), 'view_count': ('play_count', {int_or_none}), 'release_year': ('year', {int_or_none}), - 'artists': ('artists', lambda _, v: v['role'] == 'singer', 'name', {str}), - 'webpage_url': ('perma_url', {url_or_none}), # for song, playlist extraction + 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), + 'webpage_url': ('perma_url', {url_or_none}), }) - if not info.get('webpage_url'): # for album extraction / fallback - info['webpage_url'] = format_field( - song_data, [('title', 'action')], 'https://www.jiosaavn.com%s') or None - 
if webpage_url := info['webpage_url']: - info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, url_basename(webpage_url))] + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] return info - def _extract_initial_data(self, url, display_id): - webpage = self._download_webpage(url, display_id) - return self._search_json( - r'window\.__INITIAL_DATA__\s*=', webpage, - 'initial data', display_id, transform_source=js_to_json) + def _call_api(self, type_, token, note='API', params={}): + return self._download_json( + self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON', + query={ + '__call': 'webapi.get', + '_format': 'json', + '_marker': '0', + 'ctx': 'web6dot0', + 'token': token, + 'type': type_, + **params, + }) + + def _yield_songs(self, playlist_data): + for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): + song_info = self._extract_song(song_data) + url = smuggle_url(song_info['webpage_url'], { + 'id': song_data['id'], + 'encrypted_media_url': song_data['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) class JioSaavnSongIE(JioSaavnBaseIE): @@ -85,10 +102,11 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'md5': '3b84396d15ed9e083c3106f1fa589c04', 'info_dict': { 'id': 'IcoLuefJ', + 'display_id': 'OQsEfQFVUXk', 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', - 'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', 'duration': 205, 'view_count': int, 'release_year': 2018, @@ -111,8 +129,8 @@ def _real_extract(self, url): result = {'id': song_data['id']} else: # only extract metadata if this is not a url_transparent result - song_data = self._extract_initial_data(url, self._match_id(url))['song']['song'] - result = self._extract_song(song_data) + song_data = self._call_api('song', self._match_id(url))['songs'][0] + result = self._extract_song(song_data, url) result['formats'] = list(self._extract_formats(song_data)) return result @@ -130,19 +148,12 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): 'playlist_count': 10, }] - def _entries(self, playlist_data): - for song_data in traverse_obj(playlist_data, ( - 'modules', lambda _, x: x['key'] == 'list', 'data', lambda _, v: v['title']['action'])): - song_info = self._extract_song(song_data) - # album song data is missing artists and release_year, need to re-extract metadata - yield self.url_result(song_info['webpage_url'], JioSaavnSongIE, **song_info) - def _real_extract(self, url): display_id = self._match_id(url) - album_data = self._extract_initial_data(url, display_id)['albumView'] + album_data = self._call_api('album', display_id) return self.playlist_result( - self._entries(album_data), display_id, traverse_obj(album_data, ('album', 'title', 'text', {str}))) + self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) class JioSaavnPlaylistIE(JioSaavnBaseIE): @@ -154,21 +165,30 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'id': 'LlJ8ZWT1ibN5084vKHRj2Q__', 'title': 'Mood English', }, - 'playlist_mincount': 50, + 'playlist_mincount': 301, + }, { + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-hindi/DVR,pFUOwyXqIp77B1JF,A__', + 'info_dict': { + 'id': 'DVR,pFUOwyXqIp77B1JF,A__', + 'title': 'Mood 
Hindi', + }, + 'playlist_mincount': 801, }] + _PAGE_SIZE = 50 - def _entries(self, playlist_data): - for song_data in traverse_obj(playlist_data, ('list', lambda _, v: v['perma_url'])): - song_info = self._extract_song(song_data) - url = smuggle_url(song_info['webpage_url'], { - 'id': song_data['id'], - 'encrypted_media_url': song_data['encrypted_media_url'], - }) - yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + def _fetch_page(self, token, page): + return self._call_api( + 'playlist', token, f'playlist page {page}', {'p': page, 'n': self._PAGE_SIZE}) + + def _entries(self, token, first_page_data, page): + page_data = first_page_data if not page else self._fetch_page(token, page + 1) + yield from self._yield_songs(page_data) def _real_extract(self, url): display_id = self._match_id(url) - playlist_data = self._extract_initial_data(url, display_id)['playlist']['playlist'] + playlist_data = self._fetch_page(display_id, 1) + total_pages = math.ceil(int(playlist_data['list_count']) / self._PAGE_SIZE) - return self.playlist_result( - self._entries(playlist_data), display_id, traverse_obj(playlist_data, ('title', 'text', {str}))) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, display_id, playlist_data), + total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) From 315b3544296bb83012e20ee3af9d3cbf5600dd1c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:40:53 -0500 Subject: [PATCH 020/124] [ie/afreecatv:live] Add `cdn` extractor-arg (#9666) Closes #6497 Authored by: bashonly --- README.md | 3 ++ yt_dlp/extractor/afreecatv.py | 66 ++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 458541d68e..08afff201a 100644 --- a/README.md +++ b/README.md @@ -1837,6 +1837,9 @@ #### nflplusreplay #### jiosaavn * `bitrate`: Audio bitrates to request. One or more of `16`, `32`, `64`, `128`, `320`. Default is `128,320` +#### afreecatvlive +* `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. 
`gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` + **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 2c33c90dbb..3e5738f6ab 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -8,9 +8,11 @@ determine_ext, filter_dict, int_or_none, + orderedSet, unified_timestamp, url_or_none, urlencode_postdata, + urljoin, ) from ..utils.traversal import traverse_obj @@ -276,6 +278,47 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): }] _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php' + _WORKING_CDNS = [ + 'gcp_cdn', # live-global-cdn-v02.afreecatv.com + 'gs_cdn_pc_app', # pc-app.stream.afreecatv.com + 'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com + 'gs_cdn_pc_web', # pc-web.stream.afreecatv.com + ] + _BAD_CDNS = [ + 'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve) + 'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400) + 'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve) + 'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve) + 'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400) + ] + + def _extract_formats(self, channel_info, broadcast_no, aid): + stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' + + # If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs + default_cdn_ids = orderedSet([ + *traverse_obj(channel_info, ('CDN', {str}, all, lambda _, v: v not in self._BAD_CDNS)), + *self._WORKING_CDNS, + ]) + cdn_ids = self._configuration_arg('cdn', default_cdn_ids) + + for attempt, cdn_id in enumerate(cdn_ids, start=1): + m3u8_url = traverse_obj(self._download_json( + urljoin(stream_base_url, 'broad_stream_assign.html'), broadcast_no, + f'Downloading {cdn_id} stream info', f'Unable to download {cdn_id} stream info', + fatal=False, query={ + 'return_type': cdn_id, + 'broad_key': f'{broadcast_no}-common-master-hls', + }), ('view_url', {url_or_none})) + try: + return self._extract_m3u8_formats( + m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid}, + headers={'Referer': 'https://play.afreecatv.com/'}) + except ExtractorError as e: + if attempt == len(cdn_ids): + raise + self.report_warning( + f'{e.cause or e.msg}. Retrying... 
(attempt {attempt} of {len(cdn_ids)})') def _real_extract(self, url): broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno') @@ -294,7 +337,7 @@ def _real_extract(self, url): 'This livestream is protected by a password, use the --video-password option', expected=True) - aid = self._download_json( + token_info = traverse_obj(self._download_json( self._LIVE_API_URL, broadcast_no, 'Downloading access token for stream', 'Unable to download access token for stream', data=urlencode_postdata(filter_dict({ 'bno': broadcast_no, @@ -302,18 +345,17 @@ def _real_extract(self, url): 'type': 'aid', 'quality': 'master', 'pwd': password, - })))['CHANNEL']['AID'] + }))), ('CHANNEL', {dict})) or {} + aid = token_info.get('AID') + if not aid: + result = token_info.get('RESULT') + if result == 0: + raise ExtractorError('This livestream has ended', expected=True) + elif result == -6: + self.raise_login_required('This livestream is for subscribers only', method='password') + raise ExtractorError('Unable to extract access token') - stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' - stream_info = self._download_json(f'{stream_base_url}/broad_stream_assign.html', broadcast_no, query={ - # works: gs_cdn_pc_app, gs_cdn_mobile_web, gs_cdn_pc_web - 'return_type': 'gs_cdn_pc_app', - 'broad_key': f'{broadcast_no}-common-master-hls', - }, note='Downloading metadata for stream', errnote='Unable to download metadata for stream') - - formats = self._extract_m3u8_formats( - stream_info['view_url'], broadcast_no, 'mp4', m3u8_id='hls', - query={'aid': aid}, headers={'Referer': url}) + formats = self._extract_formats(channel_info, broadcast_no, aid) station_info = traverse_obj(self._download_json( 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, From 02483bea1c4dbe1bace8ca4d19700104fbb8a00f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:11:12 -0500 Subject: [PATCH 021/124] [build] Normalize `curl_cffi` group to `curl-cffi` (#9698) Closes #9682 Authored by: bashonly --- .github/workflows/build.yml | 4 ++-- README.md | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 04536e22c3..ebda09c8ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -254,7 +254,7 @@ jobs: # We need to fuse our own universal2 wheels for curl_cffi python3 -m pip install -U --user delocate mkdir curl_cffi_whls curl_cffi_universal2 - python3 devscripts/install_deps.py --print -o --include curl_cffi > requirements.txt + python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do python3 -m pip download \ --only-binary=:all: \ @@ -362,7 +362,7 @@ jobs: - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build - python devscripts/install_deps.py --include py2exe --include curl_cffi + python devscripts/install_deps.py --include py2exe --include curl-cffi python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" - name: Prepare diff --git a/README.md b/README.md index 08afff201a..37da789cf6 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ #### Impersonation The following provide support for impersonating browser requests. 
This may be required for some sites that employ TLS fingerprinting. * [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) - * Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]` + * Can be installed with the `curl-cffi` group, e.g. `pip install yt-dlp[default,curl-cffi]` * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds diff --git a/pyproject.toml b/pyproject.toml index 9faf53b9c8..5fadd14495 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ dependencies = [ [project.optional-dependencies] default = [] -curl_cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] +curl-cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] secretstorage = [ "cffi", "secretstorage", From c9ce57d9bf51541da2381d99bc096a9d0ddf1f27 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:18:56 -0500 Subject: [PATCH 022/124] [ie/patreon] Fix Vimeo embed extraction (#9712) Fixes regression in 36b240f9a72af57eb2c9d927ebb7fd1c917ebf18 Closes #9709 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 45 ++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index d4f822f52d..9381c7eab8 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,8 +1,8 @@ import itertools +import urllib.parse from .common import InfoExtractor from .vimeo import VimeoIE -from ..compat import compat_urllib_parse_unquote from ..networking.exceptions import HTTPError from ..utils import ( KNOWN_EXTENSIONS, @@ -14,7 +14,6 @@ parse_iso8601, str_or_none, traverse_obj, - try_get, url_or_none, urljoin, ) @@ -199,6 +198,27 @@ class PatreonIE(PatreonBaseIE): 'channel_id': '2147162', 'uploader_url': 'https://www.patreon.com/yaboyroshi', }, + }, { + # NSFW vimeo embed URL + 'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599', + 'info_dict': { + 'id': '902250943', + 'ext': 'mp4', + 'title': '❤️(4K) Spiderman Girl Yeonhwa’s Gift ❤️(4K) 스파이더맨걸 연화의 선물', + 'description': '❤️(4K) Spiderman Girl Yeonhwa’s Gift \n❤️(4K) 스파이더맨걸 연화의 선물', + 'uploader': 'Npickyeonhwa', + 'uploader_id': '90574422', + 'uploader_url': 'https://www.patreon.com/Yeonhwa726', + 'channel_id': '10237902', + 'channel_url': 'https://www.patreon.com/Yeonhwa726', + 'duration': 70, + 'timestamp': 1705150153, + 'upload_date': '20240113', + 'comment_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.+', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): @@ -268,16 +288,19 @@ def _real_extract(self, url): }) # handle Vimeo embeds - if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': - embed_html = try_get(attributes, lambda x: x['embed']['html']) - v_url = url_or_none(compat_urllib_parse_unquote( - self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) - if v_url: - v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com') - if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False): - return self.url_result(v_url, VimeoIE, url_transparent=True, **info) + if traverse_obj(attributes, ('embed', 'provider')) == 
'Vimeo':
+            v_url = urllib.parse.unquote(self._html_search_regex(
+                r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
+                traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
+            if url_or_none(v_url) and self._request_webpage(
+                    v_url, video_id, 'Checking Vimeo embed URL',
+                    headers={'Referer': 'https://patreon.com/'},
+                    fatal=False, errnote=False):
+                return self.url_result(
+                    VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
+                    VimeoIE, url_transparent=True, **info)
 
-        embed_url = try_get(attributes, lambda x: x['embed']['url'])
+        embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
         if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
             return self.url_result(embed_url, **info)

From e3b42d8b1b8bcfff7ba146c19fc3f6f6ba843cea Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sat, 20 Apr 2024 05:23:12 -0500
Subject: [PATCH 023/124] [ie/facebook] Fix DASH formats extraction (#9734)

Closes #9720

Authored by: bashonly
---
 yt_dlp/extractor/facebook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 834b1df189..b76407a5c7 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -560,7 +560,7 @@ def extract_from_jsmods_instances(js_data):
             js_data, lambda x: x['jsmods']['instances'], list) or [])
 
     def extract_dash_manifest(video, formats):
-        dash_manifest = video.get('dash_manifest')
+        dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str)
         if dash_manifest:
             formats.extend(self._parse_mpd_formats(
                 compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),

From 3ee1194288981c4f2c4abd8315326de0c424d2ce Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Sun, 21 Apr 2024 13:40:38 +0200
Subject: [PATCH 024/124] [ie] Make `_search_nextjs_data` non fatal (#8937)

Authored by: Grub4K
---
 test/test_InfoExtractor.py     |  9 +++++++++
 yt_dlp/extractor/asobistage.py |  2 +-
 yt_dlp/extractor/common.py     | 16 ++++++++++------
 yt_dlp/extractor/stv.py        |  2 +-
 yt_dlp/extractor/tiktok.py     |  2 +-
 5 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index b7dee496af..c633ce3e47 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
             expected_status=TEAPOT_RESPONSE_STATUS)
         self.assertEqual(content, TEAPOT_RESPONSE_BODY)
 
+    def test_search_nextjs_data(self):
+        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
+        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
+        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
+        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
+        with self.assertRaises(DeprecationWarning):
+            self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py
index b088a1b132..8fa8f3edb6 100644
--- a/yt_dlp/extractor/asobistage.py
+++ b/yt_dlp/extractor/asobistage.py
@@ -105,7 +105,7 @@ def _real_extract(self, url):
         video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
         webpage = self._download_webpage(url, video_id)
         event_data = traverse_obj(
-            self._search_nextjs_data(webpage, video_id, default='{}'),
self._search_nextjs_data(webpage, video_id, default={}), ('props', 'pageProps', 'eventCMSData', { 'title': ('event_name', {str}), 'thumbnail': ('event_thumbnail_image', {url_or_none}), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 57bbf9bdf1..bebbc6b43f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True): traverse_json_ld(json_ld) return filter_dict(info) - def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): - return self._parse_json( - self._search_regex( - r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', fatal=fatal, **kw), - video_id, transform_source=transform_source, fatal=fatal) + def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw): + if default == '{}': + self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead') + default = {} + if default is not NO_DEFAULT: + fatal = False + + return self._search_json( + r']+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', + video_id, end_pattern='', fatal=fatal, default=default, **kw) def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index 8b3e63538c..0ab7801004 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -41,7 +41,7 @@ def _real_extract(self, url): ptype, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} + props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 3f5261ad96..3d965dd452 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -776,7 +776,7 @@ def _real_extract(self, url): status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) - elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'): + elif next_data := self._search_nextjs_data(webpage, video_id, default={}): self.write_debug('Found next.js data') status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) From 8056a3026ed6ec6a6d0ed56fdd7ebcd16e928341 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:05:42 -0500 Subject: [PATCH 025/124] [ie/theatercomplextown] Fix extractors (#9754) Authored by: bashonly --- yt_dlp/extractor/stacommu.py | 10 ++++++++-- yt_dlp/extractor/wrestleuniverse.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py index 1308c595da..d2f207fcc5 100644 --- a/yt_dlp/extractor/stacommu.py +++ b/yt_dlp/extractor/stacommu.py @@ -174,7 +174,7 @@ class TheaterComplexTownBaseIE(StacommuBaseIE): class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): - _VALID_URL = 
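# Note: the (?P\w+) groups in the _VALID_URL patterns below lost their <id>
# group name to angle-bracket stripping in this copy of the patch; upstream
# they read (?P<id>\w+). A sketch of what the widened pattern accepts, with
# the group name restored (the test URLs are made up):
import re

pattern = re.compile(
    r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?videos/episodes/(?P<id>\w+)')

for url in ('https://www.theater-complex.town/videos/episodes/abc123',
            'https://www.theater-complex.town/en/videos/episodes/abc123',
            'https://www.theater-complex.town/ja/videos/episodes/abc123'):
    match = pattern.match(url)
    print(url, '->', match.group('id') if match else 'no match')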
r'https?://(?:www\.)?theater-complex\.town/(?:en/)?videos/episodes/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?videos/episodes/(?P\w+)' IE_NAME = 'theatercomplextown:vod' _TESTS = [{ 'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78', @@ -195,6 +195,9 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): }, { 'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y', 'only_matching': True, + }, { + 'url': 'https://www.theater-complex.town/ja/videos/episodes/hoxqidYNoAn7bP92DN6p78', + 'only_matching': True, }] _API_PATH = 'videoEpisodes' @@ -204,7 +207,7 @@ def _real_extract(self, url): class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): - _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?ppv/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?ppv/(?P\w+)' IE_NAME = 'theatercomplextown:ppv' _TESTS = [{ 'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen', @@ -223,6 +226,9 @@ class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): }, { 'url': 'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen', 'only_matching': True, + }, { + 'url': 'https://www.theater-complex.town/ja/ppv/qwUVmLmGEiZ3ZW6it9uGys', + 'only_matching': True, }] _API_PATH = 'events' diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 145246a148..880ee519be 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -147,7 +147,7 @@ def _download_metadata(self, url, video_id, lang, props_keys): metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False) if not metadata: webpage = self._download_webpage(url, video_id) - nextjs_data = self._search_nextjs_data(webpage, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False) metadata = traverse_obj(nextjs_data, ( 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {} return metadata From ff38a011d57b763f3a69bebd25a5dc9044a717ce Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 21 Apr 2024 17:41:40 -0500 Subject: [PATCH 026/124] [ie/crunchyroll] Fix auth and remove cookies support (#9749) Closes #9745 Authored by: bashonly --- yt_dlp/extractor/crunchyroll.py | 134 +++++++++++++++++--------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 118b575ab2..385a3c2d34 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -24,11 +24,15 @@ class CrunchyrollBaseIE(InfoExtractor): _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' + _REFRESH_TOKEN = None _AUTH_HEADERS = None + _AUTH_EXPIRY = None _API_ENDPOINT = None - _BASIC_AUTH = None + _BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join(( + 't-kdgp2h8c3jub8fn0fq', + 'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan', + )).encode()).decode() _IS_PREMIUM = None - _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') _LOCALE_LOOKUP = { 'ar': 'ar-SA', 'de': 'de-DE', @@ -43,69 +47,74 @@ class CrunchyrollBaseIE(InfoExtractor): 'hi': 'hi-IN', } - @property - def is_logged_in(self): - return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) + def _set_auth_info(self, response): + CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 
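# The hardcoded _BASIC_AUTH defined above is plain RFC 7617 Basic
# authentication: base64 over "id:secret". A sketch with placeholder
# credentials:
import base64

def basic_auth_header(client_id, client_secret):
    token = base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode()
    return 'Basic ' + token

print(basic_auth_header('some-client-id', 'some-secret'))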
'benefits', ...)) + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']} + CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) + + def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): + try: # TODO: Add impersonation support here + return self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote, + headers=headers, data=urlencode_postdata(data)) + except ExtractorError as error: + if not isinstance(error.cause, HTTPError) or error.cause.status != 403: + raise + raise ExtractorError( + 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) def _perform_login(self, username, password): - if self.is_logged_in: + if not CrunchyrollBaseIE._REFRESH_TOKEN: + CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username) + if CrunchyrollBaseIE._REFRESH_TOKEN: return - upsell_response = self._download_json( - f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', - query={ - 'sess_id': 1, - 'device_id': 'whatvalueshouldbeforweb', - 'device_type': 'com.crunchyroll.static', - 'access_token': 'giKq5eY27ny3cqz', - 'referer': f'{self._BASE_URL}/welcome/login' - }) - if upsell_response['code'] != 'ok': - raise ExtractorError('Could not get session id') - session_id = upsell_response['data']['session_id'] - - login_response = self._download_json( - f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urlencode_postdata({ - 'account': username, - 'password': password, - 'session_id': session_id - })) - if login_response['code'] != 'ok': - raise ExtractorError('Login failed. 
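# _set_auth_info above detects premium accounts by decoding the JWT access
# token's payload; no signature check is needed just to read claims. yt-dlp
# ships jwt_decode_hs256 for this. A dependency-free sketch of the decode,
# exercised with a fabricated token:
import base64
import json

def jwt_payload(token):
    payload_b64 = token.split('.')[1]
    payload_b64 += '=' * (-len(payload_b64) % 4)  # restore base64url padding
    return json.loads(base64.urlsafe_b64decode(payload_b64))

header = base64.urlsafe_b64encode(b'{"alg":"HS256"}').rstrip(b'=').decode()
body = base64.urlsafe_b64encode(b'{"benefits":["cr_premium"]}').rstrip(b'=').decode()
print(jwt_payload(f'{header}.{body}.sig'))  # -> {'benefits': ['cr_premium']}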
Server message: %s' % login_response['message'], expected=True) - if not self.is_logged_in: - raise ExtractorError('Login succeeded but did not set etp_rt cookie') - - def _update_auth(self): - if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): - return - - if not CrunchyrollBaseIE._BASIC_AUTH: - cx_api_param = self._CLIENT_ID[self.is_logged_in] - self.write_debug(f'Using cxApiParam={cx_api_param}') - CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() - - auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH} - if self.is_logged_in: - grant_type = 'etp_rt_cookie' - else: - grant_type = 'client_id' - auth_headers['ETP-Anonymous-ID'] = uuid.uuid4() try: - auth_response = self._download_json( - f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers=auth_headers, data=f'grant_type={grant_type}'.encode()) + login_response = self._request_token( + headers={'Authorization': self._BASIC_AUTH}, data={ + 'username': username, + 'password': password, + 'grant_type': 'password', + 'scope': 'offline_access', + }, note='Logging in', errnote='Failed to log in') except ExtractorError as error: - if isinstance(error.cause, HTTPError) and error.cause.status == 403: - raise ExtractorError( - 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' - 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' - 'and your browser\'s User-Agent (with --user-agent)', expected=True) + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + raise ExtractorError('Invalid username and/or password', expected=True) raise - CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...)) - CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} - CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) + CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token'] + self.cache.store(self._NETRC_MACHINE, username, CrunchyrollBaseIE._REFRESH_TOKEN) + self._set_auth_info(login_response) + + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds(): + return + + auth_headers = {'Authorization': self._BASIC_AUTH} + if CrunchyrollBaseIE._REFRESH_TOKEN: + data = { + 'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN, + 'grant_type': 'refresh_token', + 'scope': 'offline_access', + } + else: + data = {'grant_type': 'client_id'} + auth_headers['ETP-Anonymous-ID'] = uuid.uuid4() + try: + auth_response = self._request_token(auth_headers, data) + except ExtractorError as error: + username, password = self._get_login_info() + if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400: + raise + self.to_screen('Refresh token has expired. 
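# The rewritten _update_auth above is a standard expiry-aware token cache:
# keep the expiry timestamp next to the token and only re-authenticate once
# it has passed. A generic sketch, where fetch_token stands in for the real
# request and returns (token, expires_in_seconds):
import time

class TokenCache:
    def __init__(self, fetch_token):
        self._fetch = fetch_token
        self._token = None
        self._expiry = 0

    def get(self):
        if self._token is None or time.time() >= self._expiry:
            token, expires_in = self._fetch()
            # refresh 10 seconds early, mirroring the "- 10" margin above
            self._token = token
            self._expiry = time.time() + expires_in - 10
        return self._token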
Re-logging in') + CrunchyrollBaseIE._REFRESH_TOKEN = None + self.cache.store(self._NETRC_MACHINE, username, None) + self._perform_login(username, password) + return + + self._set_auth_info(auth_response) def _locale_from_language(self, language): config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) @@ -168,7 +177,8 @@ def _extract_stream(self, identifier, display_id=None): self._update_auth() stream_response = self._download_json( f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', - display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS) + display_id, note='Downloading stream info', errnote='Failed to download stream info', + headers=CrunchyrollBaseIE._AUTH_HEADERS) available_formats = {'': ('', '', stream_response['url'])} for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): @@ -383,9 +393,9 @@ def entries(): if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): message = f'This {object_type} is for premium members only' - if self.is_logged_in: + if CrunchyrollBaseIE._REFRESH_TOKEN: raise ExtractorError(message, expected=True) - self.raise_login_required(message) + self.raise_login_required(message, method='password') result['formats'], result['subtitles'] = self._extract_stream(internal_id) @@ -575,9 +585,9 @@ def _real_extract(self, url): if not self._IS_PREMIUM and response.get('isPremiumOnly'): message = f'This {response.get("type") or "media"} is for premium members only' - if self.is_logged_in: + if CrunchyrollBaseIE._REFRESH_TOKEN: raise ExtractorError(message, expected=True) - self.raise_login_required(message) + self.raise_login_required(message, method='password') result = self._transform_music_response(response) result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) From 89f535e2656964b4061c25a7739d4d6ba0a30568 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:36:01 -0500 Subject: [PATCH 027/124] [ci] Fix `curl-cffi` installation (Bugfix for 02483bea1c4dbe1bace8ca4d19700104fbb8a00f) Authored by: bashonly --- .github/workflows/core.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 076f785bf0..70769f967f 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -53,7 +53,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi + run: python3 ./devscripts/install_deps.py --include dev --include curl-cffi - name: Run tests continue-on-error: False run: | From 64766459e37451b665c1464073c28361fbcf1c25 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 27 Apr 2024 10:37:26 +0200 Subject: [PATCH 028/124] [core/windows] Improve shell quoting and tests (#9802) Authored by: Grub4K --- test/test_utils.py | 38 ++++++++++++++++++++++++++++---------- yt_dlp/utils/_utils.py | 17 +++++------------ 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index ddf0a7c242..824864577d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2059,7 +2059,22 @@ def test_extract_basic_auth(self): assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') @unittest.skipUnless(compat_os_name == 'nt', 
'Only relevant on Windows') - def test_Popen_windows_escaping(self): + def test_windows_escaping(self): + tests = [ + 'test"&', + '%CMDCMDLINE:~-1%&', + 'a\nb', + '"', + '\\', + '!', + '^!', + 'a \\ b', + 'a \\" b', + 'a \\ b\\', + # We replace \r with \n + ('a\r\ra', 'a\n\na'), + ] + def run_shell(args): stdout, stderr, error = Popen.run( args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -2067,15 +2082,18 @@ def run_shell(args): assert not error return stdout - # Test escaping - assert run_shell(['echo', 'test"&']) == '"test""&"\n' - assert run_shell(['echo', '%CMDCMDLINE:~-1%&']) == '"%CMDCMDLINE:~-1%&"\n' - assert run_shell(['echo', 'a\nb']) == '"a"\n"b"\n' - assert run_shell(['echo', '"']) == '""""\n' - assert run_shell(['echo', '\\']) == '\\\n' - # Test if delayed expansion is disabled - assert run_shell(['echo', '^!']) == '"^!"\n' - assert run_shell('echo "^!"') == '"^!"\n' + for argument in tests: + if isinstance(argument, str): + expected = argument + else: + argument, expected = argument + + args = [sys.executable, '-c', 'import sys; print(end=sys.argv[1])', argument, 'end'] + assert run_shell(args) == expected + + escaped = shell_quote(argument, shell=True) + args = f'{sys.executable} -c "import sys; print(end=sys.argv[1])" {escaped} end' + assert run_shell(args) == expected if __name__ == '__main__': diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index e3e80f3d33..b637669124 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1638,16 +1638,14 @@ def get_filesystem_encoding(): return encoding if encoding is not None else 'utf-8' -_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'}) +_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'}) _CMD_QUOTE_TRANS = str.maketrans({ # Keep quotes balanced by replacing them with `""` instead of `\\"` '"': '""', - # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`) + # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`) # `=` should be unique since variables containing `=` cannot be set using cmd '\n': '%=%', - # While we are only required to escape backslashes immediately before quotes, - # we instead escape all of 'em anyways to be consistent - '\\': '\\\\', + '\r': '%=%', # Use zero length variable replacement so `%` doesn't get expanded # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`) '%': '%%cd:~,%', @@ -1656,19 +1654,14 @@ def get_filesystem_encoding(): def shell_quote(args, *, shell=False): args = list(variadic(args)) - if any(isinstance(item, bytes) for item in args): - deprecation_warning('Passing bytes to utils.shell_quote is deprecated') - encoding = get_filesystem_encoding() - for index, item in enumerate(args): - if isinstance(item, bytes): - args[index] = item.decode(encoding) if compat_os_name != 'nt': return shlex.join(args) trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS return ' '.join( - s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""') + s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) + else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""') for s in args) From 7e26bd53f9c5893518fde81dfd0079ec08dd841e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 28 Apr 2024 15:44:46 +0200 Subject: [PATCH 029/124] [core/windows] Fix tests for `sys.executable` with spaces (Fix for 64766459e37451b665c1464073c28361fbcf1c25) Authored by: Grub4K --- test/test_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) 
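# The Windows-specific quoting exercised below has a POSIX counterpart in
# the standard library, which is what shell_quote delegates to when not on
# Windows. A quick demonstration of why naive joining is unsafe and
# shlex.join is not (POSIX shells only):
import shlex
import subprocess

args = ['printf', '%s', 'test"&']
print(' '.join(args))    # unsafe: the quote and & reach the shell unescaped
print(shlex.join(args))  # safe: printf %s 'test"&'
result = subprocess.run(shlex.join(args), shell=True,
                        capture_output=True, text=True)
print(result.stdout)     # -> test"&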
diff --git a/test/test_utils.py b/test/test_utils.py index 824864577d..816cf03f6b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2090,10 +2090,7 @@ def run_shell(args): args = [sys.executable, '-c', 'import sys; print(end=sys.argv[1])', argument, 'end'] assert run_shell(args) == expected - - escaped = shell_quote(argument, shell=True) - args = f'{sys.executable} -c "import sys; print(end=sys.argv[1])" {escaped} end' - assert run_shell(args) == expected + assert run_shell(shell_quote(args, shell=True)) == expected if __name__ == '__main__': From 1a366403d9c26b992faa77e00f4d02ead57559e3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 28 Apr 2024 10:35:17 -0500 Subject: [PATCH 030/124] [build] Run `macos_legacy` job on `macos-12` (#9804) `macos-latest` has been bumped to `macos-14-arm64` which breaks the builds Authored by: bashonly --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ebda09c8ca..34b504f10d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -300,7 +300,7 @@ jobs: macos_legacy: needs: process if: inputs.macos_legacy - runs-on: macos-latest + runs-on: macos-12 steps: - uses: actions/checkout@v4 From ac817bc83efd939dca3e40c4b527d0ccfc77172b Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 29 Apr 2024 00:19:25 +0200 Subject: [PATCH 031/124] [build] Migrate `linux_exe` to static musl builds (#9811) Authored by: Grub4K, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- .github/workflows/build.yml | 99 +++++++++++++++--------------- bundle/docker/compose.yml | 10 +++ bundle/docker/static/Dockerfile | 21 +++++++ bundle/docker/static/entrypoint.sh | 13 ++++ yt_dlp/update.py | 4 ++ 5 files changed, 97 insertions(+), 50 deletions(-) create mode 100644 bundle/docker/compose.yml create mode 100644 bundle/docker/static/Dockerfile create mode 100755 bundle/docker/static/entrypoint.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 34b504f10d..d9352fedd8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,6 +12,9 @@ on: unix: default: true type: boolean + linux_static: + default: true + type: boolean linux_arm: default: true type: boolean @@ -27,9 +30,6 @@ on: windows32: default: true type: boolean - meta_files: - default: true - type: boolean origin: required: false default: '' @@ -52,7 +52,11 @@ on: default: stable type: string unix: - description: yt-dlp, yt-dlp.tar.gz, yt-dlp_linux, yt-dlp_linux.zip + description: yt-dlp, yt-dlp.tar.gz + default: true + type: boolean + linux_static: + description: yt-dlp_linux default: true type: boolean linux_arm: @@ -75,10 +79,6 @@ on: description: yt-dlp_x86.exe default: true type: boolean - meta_files: - description: SHA2-256SUMS, SHA2-512SUMS, _update_spec - default: true - type: boolean origin: description: Origin required: false @@ -112,27 +112,9 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.10" - - uses: conda-incubator/setup-miniconda@v3 - with: - miniforge-variant: Mambaforge - use-mamba: true - channels: conda-forge - auto-update-conda: true - activate-environment: "" - auto-activate-base: false - name: Install Requirements run: | sudo apt -y install zip pandoc man sed - cat > ./requirements.txt << EOF - python=3.10.* - pyinstaller - brotli-python - EOF - python devscripts/install_deps.py --print \ - --exclude brotli --exclude 
brotlicffi \ - --include secretstorage >> ./requirements.txt - mamba create -n build --file ./requirements.txt - - name: Prepare run: | python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" @@ -141,30 +123,15 @@ jobs: - name: Build Unix platform-independent binary run: | make all tar - - name: Build Unix standalone binary - shell: bash -l {0} - run: | - unset LD_LIBRARY_PATH # Harmful; set by setup-python - conda activate build - python -m bundle.pyinstaller --onedir - (cd ./dist/yt-dlp_linux && zip -r ../yt-dlp_linux.zip .) - python -m bundle.pyinstaller - mv ./dist/yt-dlp_linux ./yt-dlp_linux - mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip - - name: Verify --update-to if: vars.UPDATE_TO_VERIFICATION run: | - binaries=("yt-dlp" "yt-dlp_linux") - for binary in "${binaries[@]}"; do - chmod +x ./${binary} - cp ./${binary} ./${binary}_downgraded - version="$(./${binary} --version)" - ./${binary}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 - downgraded_version="$(./${binary}_downgraded --version)" - [[ "$version" != "$downgraded_version" ]] - done - + chmod +x ./yt-dlp + cp ./yt-dlp ./yt-dlp_downgraded + version="$(./yt-dlp --version)" + ./yt-dlp_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./yt-dlp_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] - name: Upload artifacts uses: actions/upload-artifact@v4 with: @@ -172,8 +139,39 @@ jobs: path: | yt-dlp yt-dlp.tar.gz - yt-dlp_linux - yt-dlp_linux.zip + compression-level: 0 + + linux_static: + needs: process + if: inputs.linux_static + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build static executable + env: + channel: ${{ inputs.channel }} + origin: ${{ needs.process.outputs.origin }} + version: ${{ inputs.version }} + run: | + mkdir ~/build + cd bundle/docker + docker compose up --build static + sudo chown "${USER}:docker" ~/build/yt-dlp_linux + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ~/build/yt-dlp_linux + cp ~/build/yt-dlp_linux ~/build/yt-dlp_linux_downgraded + version="$(~/build/yt-dlp_linux --version)" + ~/build/yt-dlp_linux_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(~/build/yt-dlp_linux_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + ~/build/yt-dlp_linux compression-level: 0 linux_arm: @@ -447,10 +445,11 @@ jobs: compression-level: 0 meta_files: - if: inputs.meta_files && always() && !cancelled() + if: always() && !cancelled() needs: - process - unix + - linux_static - linux_arm - macos - macos_legacy diff --git a/bundle/docker/compose.yml b/bundle/docker/compose.yml new file mode 100644 index 0000000000..5f89ca6d09 --- /dev/null +++ b/bundle/docker/compose.yml @@ -0,0 +1,10 @@ +services: + static: + build: static + environment: + channel: ${channel} + origin: ${origin} + version: ${version} + volumes: + - ~/build:/build + - ../..:/yt-dlp diff --git a/bundle/docker/static/Dockerfile b/bundle/docker/static/Dockerfile new file mode 100644 index 0000000000..dae2dff3d8 --- /dev/null +++ b/bundle/docker/static/Dockerfile @@ -0,0 +1,21 @@ +FROM alpine:3.19 as base + +RUN apk --update add --no-cache \ + build-base \ + python3 \ + pipx \ + ; + +RUN pipx install pyinstaller +# Requires above step to prepare the shared venv +RUN ~/.local/share/pipx/shared/bin/python -m pip install -U 
wheel +RUN apk --update add --no-cache \ + scons \ + patchelf \ + binutils \ + ; +RUN pipx install staticx + +WORKDIR /yt-dlp +COPY entrypoint.sh /entrypoint.sh +ENTRYPOINT /entrypoint.sh diff --git a/bundle/docker/static/entrypoint.sh b/bundle/docker/static/entrypoint.sh new file mode 100755 index 0000000000..93d84fa9b7 --- /dev/null +++ b/bundle/docker/static/entrypoint.sh @@ -0,0 +1,13 @@ +#!/bin/ash +set -e + +source ~/.local/share/pipx/venvs/pyinstaller/bin/activate +python -m devscripts.install_deps --include secretstorage +python -m devscripts.make_lazy_extractors +python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}" +python -m bundle.pyinstaller +deactivate + +source ~/.local/share/pipx/venvs/staticx/bin/activate +staticx /yt-dlp/dist/yt-dlp_linux /build/yt-dlp_linux +deactivate diff --git a/yt_dlp/update.py b/yt_dlp/update.py index f47cbc5b29..ca70f69a7e 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -69,6 +69,10 @@ def _get_variant_and_executable_path(): # Ref: https://en.wikipedia.org/wiki/Uname#Examples if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): machine = '_x86' if platform.architecture()[0][:2] == '32' else '' + # sys.executable returns a /tmp/ path for staticx builds (linux_static) + # Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information + if static_exe_path := os.getenv('STATICX_PROG_PATH'): + path = static_exe_path return f'{remove_end(sys.platform, "32")}{machine}_exe', path path = os.path.dirname(__file__) From c4853655cb9a793129280806af643de43c48f4d5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:07:15 -0500 Subject: [PATCH 032/124] [ie/wrestleuniverse] Avoid partial stream formats (#9800) Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 880ee519be..d401d6d39d 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -12,6 +12,7 @@ jwt_decode_hs256, traverse_obj, try_call, + url_basename, url_or_none, urlencode_postdata, variadic, @@ -194,8 +195,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'formats': self._get_formats(video_data, ( - (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id), + 'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id), **traverse_obj(metadata, { 'title': ('displayName', {str}), 'description': ('description', {str}), @@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE): 'params': { 'skip_download': 'm3u8', }, + }, { + 'note': 'manifest provides live-a (partial) and live-b (full) streams', + 'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g', + 'only_matching': True, }] _API_PATH = 'events' @@ -285,12 +289,16 @@ def _real_extract(self, url): video_data, decrypt = self._call_encrypted_api( video_id, ':watchArchive', 'watch archive', data={'method': 1}) - info['formats'] = self._get_formats(video_data, ( - ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id) + # 'chromecastUrls' can be only partial videos, avoid + info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id) for f in info['formats']: # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values if f.get('tbr'): f['tbr'] = 
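# The wrestleuniverse hunk continuing below demotes HLS variants whose file
# name does not share the master playlist's basename stem, since those can
# be partial streams. A standalone sketch of that stem comparison using only
# the standard library (yt-dlp uses its url_basename helper; the URLs here
# are made up):
import posixpath
import urllib.parse

def stem(url):
    path = urllib.parse.urlparse(url).path
    return posixpath.basename(path).partition('.')[0]

master = 'https://example.com/hls/live-b.m3u8'
for variant in ('https://example.com/hls/live-b_1080.m3u8',
                'https://example.com/hls/live-a_1080.m3u8'):
    preference = 0 if stem(variant).startswith(stem(master)) else -10
    print(variant, '->', preference)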
int(f['tbr'] / 2.5) + # prefer variants with the same basename as the master playlist to avoid partial streams + f['format_id'] = url_basename(f['url']).partition('.')[0] + if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]): + f['preference'] = -10 hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt})) if hls_aes_key: From 231c2eacc41b06b65c63edf94c0d04768a5da607 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:14:36 -0500 Subject: [PATCH 033/124] [ie/soundcloud] Extract `genres` (#9821) Authored by: bashonly --- yt_dlp/extractor/soundcloud.py | 50 ++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index c9ed645eb7..c9ca41a5cd 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -361,7 +361,7 @@ def extract_count(key): 'like_count': extract_count('favoritings') or extract_count('likes'), 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), + 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)), 'formats': formats if not extract_flat else None } @@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE): _TESTS = [ { 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', + 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2', 'info_dict': { 'id': '62986583', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'uploader': 'E.T. 
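# The genre -> genres change above turns a scalar metadata field into a list
# via yt-dlp's traverse_obj mini-language: {str} filters by type,
# {lambda x: x or None} drops empty strings, and `all` collects the
# survivors into a list. A plain-Python equivalent for reference:
def extract_genres(info):
    value = info.get('genre')
    if isinstance(value, str) and value:
        return [value]
    return []

print(extract_genres({'genre': 'Trance'}))  # -> ['Trance']
print(extract_genres({'genre': ''}))        # -> []
print(extract_genres({}))                   # -> []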
ExTerrestrial Music', @@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', + 'uploader_url': 'https://soundcloud.com/ethmusic', + 'genres': [], } }, # geo-restricted @@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { 'id': '47127627', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Goldrushed', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'uploader': 'The Royal Concept', @@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/the-concept-band', + 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', + 'genres': ['Alternative'], }, }, # private link @@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/jaimemf', + 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', + 'genres': ['youtubedl'], }, }, # private link (alt format) @@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/jaimemf', + 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', + 'genres': ['youtubedl'], }, }, # downloadable song @@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE): 'info_dict': { 'id': '343609555', 'ext': 'wav', + 'title': 'The Following', + 'description': '', + 'uploader': '80M', + 'uploader_id': '312384765', + 'uploader_url': 'https://soundcloud.com/the80m', + 'upload_date': '20170922', + 'timestamp': 1506120436, + 'duration': 397.228, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg', + 'license': 'all-rights-reserved', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'view_count': int, + 'genres': ['Dance & EDM'], }, }, # private link, downloadable format @@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg', + 'uploader_url': 'https://soundcloud.com/oriuplift', + 'genres': ['Trance'], }, }, # no album art, use avatar pic for thumbnail @@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/garyvee', + 'genres': [], }, 'params': { 'skip_download': True, @@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE): }, { 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', - 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'md5': '8227c3473a4264df6b02ad7e5b7527ac', 'info_dict': { 'id': '583011102', - 'ext': 'mp3', + 'ext': 'opus', 'title': 'Mezzo Valzer', - 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Micronie', + 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a', + 'uploader': 'Giovanni Sarani', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE): 'like_count': int, 'comment_count': int, 'repost_count': int, + 'genres': ['Piano'], + 'uploader_url': 'https://soundcloud.com/giovannisarani', 
}, }, { From cb2fb4a643949322adba561ca73bcba3221ec0c5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 11:15:44 -0500 Subject: [PATCH 034/124] [ie/crunchyroll] Always make metadata available (#9772) Closes #9750 Authored by: bashonly --- yt_dlp/extractor/crunchyroll.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 385a3c2d34..a157cddac2 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -394,10 +394,11 @@ def entries(): if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): message = f'This {object_type} is for premium members only' if CrunchyrollBaseIE._REFRESH_TOKEN: - raise ExtractorError(message, expected=True) - self.raise_login_required(message, method='password') - - result['formats'], result['subtitles'] = self._extract_stream(internal_id) + self.raise_no_formats(message, expected=True, video_id=internal_id) + else: + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], result['subtitles'] = self._extract_stream(internal_id) result['chapters'] = self._extract_chapters(internal_id) @@ -583,14 +584,16 @@ def _real_extract(self, url): if not response: raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + result = self._transform_music_response(response) + if not self._IS_PREMIUM and response.get('isPremiumOnly'): message = f'This {response.get("type") or "media"} is for premium members only' if CrunchyrollBaseIE._REFRESH_TOKEN: - raise ExtractorError(message, expected=True) - self.raise_login_required(message, method='password') - - result = self._transform_music_response(response) - result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) + self.raise_no_formats(message, expected=True, video_id=internal_id) + else: + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) return result From 036e0d92c6052465673d459678322ea03e61483d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 4 May 2024 17:11:11 -0500 Subject: [PATCH 035/124] [ie/patreon] Extract multiple embeds (#9850) Closes #9848 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 134 ++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 9381c7eab8..6c441ff34c 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'params': {'skip_download': 'm3u8'}, + }, { + # multiple attachments/embeds + 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977', + 'playlist_count': 3, + 'info_dict': { + 'id': '100601977', + 'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis', + 'description': 'md5:d099ab976edfce6de2a65c2b169a88d3', + 'uploader': 'Bradley Hall', + 'uploader_id': '24401883', + 'uploader_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_id': '3193932', + 'channel_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_follower_count': int, + 'timestamp': 1710777855, + 'upload_date': '20240318', + 'like_count': int, + 
'comment_count': int, + 'thumbnail': r're:^https?://.+', + }, + 'skip': 'Patron-only content', }] + _RETURN_TYPE = 'video' def _real_extract(self, url): video_id = self._match_id(url) @@ -234,58 +256,54 @@ def _real_extract(self, url): 'include': 'audio,user,user_defined_tags,campaign,attachments_media', }) attributes = post['data']['attributes'] - title = attributes['title'].strip() - image = attributes.get('image') or {} - info = { - 'id': video_id, - 'title': title, - 'description': clean_html(attributes.get('content')), - 'thumbnail': image.get('large_url') or image.get('url'), - 'timestamp': parse_iso8601(attributes.get('published_at')), - 'like_count': int_or_none(attributes.get('like_count')), - 'comment_count': int_or_none(attributes.get('comment_count')), - } - can_view_post = traverse_obj(attributes, 'current_user_can_view') - if can_view_post and info['comment_count']: - info['__post_extractor'] = self.extract_comments(video_id) + info = traverse_obj(attributes, { + 'title': ('title', {str.strip}), + 'description': ('content', {clean_html}), + 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any), + 'timestamp': ('published_at', {parse_iso8601}), + 'like_count': ('like_count', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + }) - for i in post.get('included', []): - i_type = i.get('type') - if i_type == 'media': - media_attributes = i.get('attributes') or {} - download_url = media_attributes.get('download_url') + entries = [] + idx = 0 + for include in traverse_obj(post, ('included', lambda _, v: v['type'])): + include_type = include['type'] + if include_type == 'media': + media_attributes = traverse_obj(include, ('attributes', {dict})) or {} + download_url = url_or_none(media_attributes.get('download_url')) ext = mimetype2ext(media_attributes.get('mimetype')) # if size_bytes is None, this media file is likely unavailable # See: https://github.com/yt-dlp/yt-dlp/issues/4608 size_bytes = int_or_none(media_attributes.get('size_bytes')) if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: - # XXX: what happens if there are multiple attachments? 
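# The rewrite continuing below answers the "XXX: what happens if there are
# multiple attachments?" comment it removes: every playable item is
# collected first, and the return shape is decided at the end. A schematic
# of that decision with simplified dict shapes (not the extractor's exact
# fields):
def build_result(video_id, info, entries):
    if not entries:
        raise ValueError('no supported media found in this post')
    if len(entries) == 1:
        return {**info, **entries[0]}  # single item: merge metadata directly
    return {'_type': 'playlist', 'id': video_id,  # several: wrap as playlist
            'entries': [{**info, **entry} for entry in entries]}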
- return { - **info, + idx += 1 + entries.append({ + 'id': f'{video_id}-{idx}', 'ext': ext, 'filesize': size_bytes, 'url': download_url, - } - elif i_type == 'user': - user_attributes = i.get('attributes') - if user_attributes: - info.update({ - 'uploader': user_attributes.get('full_name'), - 'uploader_id': str_or_none(i.get('id')), - 'uploader_url': user_attributes.get('url'), }) - elif i_type == 'post_tag': - info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) + elif include_type == 'user': + info.update(traverse_obj(include, { + 'uploader': ('attributes', 'full_name', {str}), + 'uploader_id': ('id', {str_or_none}), + 'uploader_url': ('attributes', 'url', {url_or_none}), + })) - elif i_type == 'campaign': - info.update({ - 'channel': traverse_obj(i, ('attributes', 'title')), - 'channel_id': str_or_none(i.get('id')), - 'channel_url': traverse_obj(i, ('attributes', 'url')), - 'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), - }) + elif include_type == 'post_tag': + if post_tag := traverse_obj(include, ('attributes', 'value', {str})): + info.setdefault('tags', []).append(post_tag) + + elif include_type == 'campaign': + info.update(traverse_obj(include, { + 'channel': ('attributes', 'title', {str}), + 'channel_id': ('id', {str_or_none}), + 'channel_url': ('attributes', 'url', {url_or_none}), + 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), + })) # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -296,36 +314,50 @@ def _real_extract(self, url): v_url, video_id, 'Checking Vimeo embed URL', headers={'Referer': 'https://patreon.com/'}, fatal=False, errnote=False): - return self.url_result( + entries.append(self.url_result( VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), - VimeoIE, url_transparent=True, **info) + VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): - return self.url_result(embed_url, **info) + entries.append(self.url_result(embed_url)) - post_file = traverse_obj(attributes, 'post_file') + post_file = traverse_obj(attributes, ('post_file', {dict})) if post_file: name = post_file.get('name') ext = determine_ext(name) if ext in KNOWN_EXTENSIONS: - return { - **info, + entries.append({ + 'id': video_id, 'ext': ext, 'url': post_file['url'], - } + }) elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) - return { - **info, + entries.append({ + 'id': video_id, 'formats': formats, 'subtitles': subtitles, - } + }) - if can_view_post is False: + can_view_post = traverse_obj(attributes, 'current_user_can_view') + comments = None + if can_view_post and info.get('comment_count'): + comments = self.extract_comments(video_id) + + if not entries and can_view_post is False: self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) - else: + elif not entries: self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) + elif len(entries) == 1: + info.update(entries[0]) + else: + for entry in entries: + entry.update(info) + return self.playlist_result(entries, video_id, **info, __post_extractor=comments) + + info['id'] = video_id + info['__post_extractor'] = comments return info def _get_comments(self, post_id): From 
bec9a59e8ec82c18e3bf9268eaa436793dd52e35 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sat, 4 May 2024 17:19:42 -0500
Subject: [PATCH 036/124] [networking] Add `extensions` attribute to `Response` (#9756)

CurlCFFIRH now provides an `impersonate` field in its responses' extensions

Authored by: bashonly
---
 test/test_networking.py        | 19 +++++++++++++++++++
 yt_dlp/networking/_curlcffi.py | 10 ++++++++++
 yt_dlp/networking/common.py    |  6 +++++-
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/test/test_networking.py b/test/test_networking.py
index b50f70d086..d613cb5681 100644
--- a/test/test_networking.py
+++ b/test/test_networking.py
@@ -785,6 +785,25 @@ def test_supported_impersonate_targets(self, handler):
             assert res.status == 200
             assert std_headers['user-agent'].lower() not in res.read().decode().lower()

+    def test_response_extensions(self, handler):
+        with handler() as rh:
+            for target in rh.supported_targets:
+                request = Request(
+                    f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target})
+                res = validate_and_send(rh, request)
+                assert res.extensions['impersonate'] == rh._get_request_target(request)
+
+    def test_http_error_response_extensions(self, handler):
+        with handler() as rh:
+            for target in rh.supported_targets:
+                request = Request(
+                    f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target})
+                try:
+                    validate_and_send(rh, request)
+                except HTTPError as e:
+                    res = e.response
+                    assert res.extensions['impersonate'] == rh._get_request_target(request)
+

 class TestRequestHandlerMisc:
     """Misc generic tests for request handlers, not related to request or validation testing"""

diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py
index 39d1f70fb0..10751a1050 100644
--- a/yt_dlp/networking/_curlcffi.py
+++ b/yt_dlp/networking/_curlcffi.py
@@ -132,6 +132,16 @@ def _check_extensions(self, extensions):
         extensions.pop('cookiejar', None)
         extensions.pop('timeout', None)

+    def send(self, request: Request) -> Response:
+        target = self._get_request_target(request)
+        try:
+            response = super().send(request)
+        except HTTPError as e:
+            e.response.extensions['impersonate'] = target
+            raise
+        response.extensions['impersonate'] = target
+        return response
+
     def _send(self, request: Request):
         max_redirects_exceeded = False
         session: curl_cffi.requests.Session = self._get_instance(

diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py
index 4c66ba66aa..a2217034c9 100644
--- a/yt_dlp/networking/common.py
+++ b/yt_dlp/networking/common.py
@@ -497,6 +497,7 @@ class Response(io.IOBase):
     @param headers: response headers.
     @param status: Response HTTP status code. Default is 200 OK.
     @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
+    @param extensions: Dictionary of handler-specific response extensions.
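# A usage note for the new attribute documented above: handlers may attach
# handler-specific metadata to responses, e.g. curl_cffi reporting which
# impersonation target actually served the request. Hypothetical consumer
# code for any Response-like object:
def report_impersonation(response):
    # `extensions` defaults to {} (note the `extensions or {}` assignment in
    # __init__ below), so .get() is safe even when a handler sets nothing
    target = response.extensions.get('impersonate')
    if target is not None:
        print(f'request was served by impersonation target {target}')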
""" def __init__( @@ -505,7 +506,9 @@ def __init__( url: str, headers: Mapping[str, str], status: int = 200, - reason: str = None): + reason: str = None, + extensions: dict = None + ): self.fp = fp self.headers = Message() @@ -517,6 +520,7 @@ def __init__( self.reason = reason or HTTPStatus(status).phrase except ValueError: self.reason = None + self.extensions = extensions or {} def readable(self): return self.fp.readable() From 96da9525043f78aca4544d01761b13b2140e9ae6 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Sun, 5 May 2024 00:44:08 +0200 Subject: [PATCH 037/124] [core] Warn if lack of ffmpeg alters format selection (#9805) Authored by: seproDev, pukkandan --- yt_dlp/YoutubeDL.py | 53 +++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9f730d0384..e0d58f0f49 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2136,6 +2136,11 @@ def _filter(f): def _check_formats(self, formats): for f in formats: + working = f.get('__working') + if working is not None: + if working: + yield f + continue self.to_screen('[info] Testing format %s' % f['format_id']) path = self.get_output_path('temp') if not self._ensure_dir_exists(f'{path}/'): @@ -2152,33 +2157,44 @@ def _check_formats(self, formats): os.remove(temp_file.name) except OSError: self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) + f['__working'] = success if success: yield f else: self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) + def _select_formats(self, formats, selector): + return list(selector({ + 'formats': formats, + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video + or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio + })) + def _default_format_spec(self, info_dict, download=True): + download = download and not self.params.get('simulate') + prefer_best = download and ( + self.params['outtmpl']['default'] == '-' + or info_dict.get('is_live') and not self.params.get('live_from_start')) def can_merge(): merger = FFmpegMergerPP(self) return merger.available and merger.can_merge() - prefer_best = ( - not self.params.get('simulate') - and download - and ( - not can_merge() - or info_dict.get('is_live') and not self.params.get('live_from_start') - or self.params['outtmpl']['default'] == '-')) - compat = ( - prefer_best - or self.params.get('allow_multiple_audio_streams', False) - or 'format-spec' in self.params['compat_opts']) + if not prefer_best and download and not can_merge(): + prefer_best = True + formats = self._get_formats(info_dict) + evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec)) + if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'): + self.report_warning('ffmpeg not found. The downloaded format may not be the best available. 
' + 'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies') - return ( - 'best/bestvideo+bestaudio' if prefer_best - else 'bestvideo*+bestaudio/best' if not compat - else 'bestvideo+bestaudio/best') + compat = (self.params.get('allow_multiple_audio_streams') + or 'format-spec' in self.params['compat_opts']) + + return ('best/bestvideo+bestaudio' if prefer_best + else 'bestvideo+bestaudio/best' if compat + else 'bestvideo*+bestaudio/best') def build_format_selector(self, format_spec): def syntax_error(note, start): @@ -2928,12 +2944,7 @@ def is_wellformed(f): self.write_debug(f'Default format spec: {req_format}') format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector({ - 'formats': formats, - 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), - 'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video - or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio - })) + formats_to_download = self._select_formats(formats, format_selector) if interactive_format_selection and not formats_to_download: self.report_error('Requested format is not available', tb=False, is_error=False) continue From 351368cb9a6731b886a58f5a10fd6b302bbe47be Mon Sep 17 00:00:00 2001 From: The-MAGI <110553776+The-MAGI@users.noreply.github.com> Date: Mon, 6 May 2024 01:57:38 +0300 Subject: [PATCH 038/124] [ie/youporn] Fix extractor (#8827) Closes #7967 Authored by: The-MAGI --- yt_dlp/extractor/youporn.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 6ee0abcae0..6d4e31bf34 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor): 'id': '16290308', 'age_limit': 18, 'categories': [], - 'description': 'md5:00ea70f642f431c379763c17c2f396bc', + 'description': str, # TODO: detect/remove SEO spam description in ytdl backport 'display_id': 'tinderspecial-trailer1', 'duration': 298.0, 'ext': 'mp4', 'upload_date': '20201123', 'uploader': 'Ersties', 'tags': [], - 'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg', - 'timestamp': 1606089600, + 'thumbnail': r're:https://.+\.jpg', + 'timestamp': 1606147564, 'title': 'Tinder In Real Life', 'view_count': int, } @@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor): def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'display_id') - definitions = self._download_json( - f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id) + self._set_cookie('.youporn.com', 'age_verified', '1') + webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id) + definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions'] - def get_format_data(data, f): - return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl'])) + def get_format_data(data, stream_type): + info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any)) + if not info_url: + return [] + return traverse_obj( + self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False), + lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl'])) formats = [] # Try to extract only the 
actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s @@ -123,10 +129,6 @@ def get_format_data(data, f): f['height'] = height formats.append(f) - webpage = self._download_webpage( - 'http://www.youporn.com/watch/%s' % video_id, display_id, - headers={'Cookie': 'age_verified=1'}) - title = self._html_search_regex( r'(?s)]+class=["\']watchVideoTitle[^>]+>(.+?)', webpage, 'title', default=None) or self._og_search_title( From c8bf48f3a8fa29587e7c73ef5a7710385a5ea725 Mon Sep 17 00:00:00 2001 From: Chris Caruso Date: Sun, 5 May 2024 16:02:24 -0700 Subject: [PATCH 039/124] [ie/cbc.ca:player] Improve `_VALID_URL` (#9866) Closes #9825 Authored by: carusocr --- yt_dlp/extractor/cbc.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index ff320dd683..a4180262b7 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -151,7 +151,7 @@ def _real_extract(self, url): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor): 'location': 'Canada', 'media_type': 'Full Program', }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/1.7194274', + 'md5': '188b96cf6bdcb2540e178a6caa957128', + 'info_dict': { + 'id': '2334524995812', + 'ext': 'mp4', + 'title': '#TheMoment a rare white spirit moose was spotted in Alberta', + 'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3', + 'timestamp': 1714788791, + 'duration': 77.678, + 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg', + 'uploader': 'CBCC-NEW', + 'chapters': 'count:0', + 'upload_date': '20240504', + 'categories': 'count:3', + 'series': 'The National', + 'tags': 'count:15', + 'creators': ['encoder'], + 'location': 'Canada', + 'media_type': 'Excerpt', + }, }, { 'url': 'cbcplayer:1.7159484', 'only_matching': True, From 5904853ae5788509fdc4892cb7ecdfa9ae7f78e6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 5 May 2024 18:15:32 -0500 Subject: [PATCH 040/124] [ie/crunchyroll] Support browser impersonation (#9857) Closes #7442 Authored by: bashonly --- yt_dlp/extractor/crunchyroll.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index a157cddac2..90967c1607 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -53,15 +53,19 @@ def _set_auth_info(self, response): CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): - try: # TODO: Add impersonation support here + try: return self._download_json( f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote, - headers=headers, data=urlencode_postdata(data)) + headers=headers, data=urlencode_postdata(data), impersonate=True) except ExtractorError as error: if not isinstance(error.cause, HTTPError) or 
error.cause.status != 403: raise + if target := error.cause.response.extensions.get('impersonate'): + raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"') raise ExtractorError( - 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'Request blocked by Cloudflare. ' + 'Install the required impersonation dependency if possible, ' + 'or else navigate to Crunchyroll in your browser, ' 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' 'and your browser\'s User-Agent (with --user-agent)', expected=True) From 145dc6f6563e80d2da1b3e9aea2ffa795b71622c Mon Sep 17 00:00:00 2001 From: Rasmus Antons Date: Wed, 8 May 2024 22:16:32 +0200 Subject: [PATCH 041/124] [ie/boosty] Add cookies support (#9522) Closes #9401 Authored by: RasmusAntons --- yt_dlp/extractor/boosty.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/boosty.py b/yt_dlp/extractor/boosty.py index fb14ca1467..d3aab7a1a8 100644 --- a/yt_dlp/extractor/boosty.py +++ b/yt_dlp/extractor/boosty.py @@ -1,7 +1,11 @@ +import json +import urllib.parse + from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( ExtractorError, + bug_reports_message, int_or_none, qualities, str_or_none, @@ -162,9 +166,19 @@ def _extract_formats(self, player_urls, video_id): def _real_extract(self, url): user, post_id = self._match_valid_url(url).group('user', 'post_id') + + auth_headers = {} + auth_cookie = self._get_cookies('https://boosty.to/').get('auth') + if auth_cookie is not None: + try: + auth_data = json.loads(urllib.parse.unquote(auth_cookie.value)) + auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}' + except (json.JSONDecodeError, KeyError): + self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}') + post = self._download_json( f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id, - note='Downloading post data', errnote='Unable to download post data') + note='Downloading post data', errnote='Unable to download post data', headers=auth_headers) post_title = post.get('title') if not post_title: @@ -202,7 +216,9 @@ def _real_extract(self, url): 'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}), }, get_all=False)}) - if not entries: + if not entries and not post.get('hasAccess'): + self.raise_login_required('This post requires a subscription', metadata_available=True) + elif not entries: raise ExtractorError('No videos found', expected=True) if len(entries) == 1: return entries[0] From b38018b781b062d5169d104ab430489aef8e7f1e Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Wed, 8 May 2024 20:51:16 +0000 Subject: [PATCH 042/124] [ie/mixch] Extract comments (#9860) Authored by: pzhlkj6612 --- yt_dlp/extractor/mixch.py | 41 +++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index b980fd01a8..58c4a23018 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,6 +1,12 @@ from .common import InfoExtractor from ..networking.exceptions import HTTPError -from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none +from ..utils import ( + ExtractorError, + UserNotLive, + int_or_none, + str_or_none, + url_or_none, +) from ..utils.traversal import traverse_obj @@ -9,17 +15,20 @@ class MixchIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://mixch.tv/u/16236849/live', + 'url': 'https://mixch.tv/u/16943797/live', 'skip': 'don\'t know if this live persists', 'info_dict': { - 'id': '16236849', - 'title': '24配信シェア⭕️投票🙏💦', - 'comment_count': 13145, - 'view_count': 28348, - 'timestamp': 1636189377, - 'uploader': '🦥伊咲👶🏻#フレアワ', - 'uploader_id': '16236849', - } + 'id': '16943797', + 'ext': 'mp4', + 'title': '#EntView #カリナ #セブチ 2024-05-05 06:58', + 'comment_count': int, + 'view_count': int, + 'timestamp': 1714726805, + 'uploader': 'Ent.View K-news🎶💕', + 'uploader_id': '16943797', + 'live_status': 'is_live', + 'upload_date': '20240503', + }, }, { 'url': 'https://mixch.tv/u/16137876/live', 'only_matching': True, @@ -48,8 +57,20 @@ def _real_extract(self, url): 'protocol': 'm3u8', }], 'is_live': True, + '__post_extractor': self.extract_comments(video_id), } + def _get_comments(self, video_id): + yield from traverse_obj(self._download_json( + f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id, + note='Downloading comments', errnote='Failed to download comments'), (..., { + 'author': ('name', {str}), + 'author_id': ('user_id', {str_or_none}), + 'id': ('message_id', {str}, {lambda x: x or None}), + 'text': ('body', {str}), + 'timestamp': ('created', {int}), + })) + class MixchArchiveIE(InfoExtractor): IE_NAME = 'mixch:archive' From df5c9e733aaba703cf285c0372b6d61629330c82 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Wed, 8 May 2024 23:02:22 +0200 Subject: [PATCH 043/124] [ie/vk] Improve format extraction (#9885) Closes #5675 Authored by: seproDev --- yt_dlp/extractor/vk.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 7e3a3a9a98..28d5026850 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -451,6 +451,7 @@ def _real_extract(self, url): info_page, 'view count', default=None)) formats = [] + subtitles = {} for format_id, format_url in data.items(): format_url = url_or_none(format_url) if not format_url or not format_url.startswith(('http', '//', 'rtmp')): @@ -462,12 +463,21 @@ def _real_extract(self, url): formats.append({ 'format_id': format_id, 'url': format_url, + 'ext': 'mp4', + 'source_preference': 1, 'height': height, }) elif format_id == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( format_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False, live=is_live)) + m3u8_id=format_id, fatal=False, live=is_live) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_id.startswith('dash_'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif format_id == 'rtmp': formats.append({ 'format_id': format_id, @@ -475,7 +485,6 @@ def _real_extract(self, url): 'ext': 'flv', }) - subtitles = {} for sub in data.get('subs') or {}: subtitles.setdefault(sub.get('lang', 'en'), []).append({ 'ext': sub.get('title', '.srt').split('.')[-1], @@ -496,6 +505,7 @@ def _real_extract(self, url): 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, 'subtitles': subtitles, + '_format_sort_fields': ('res', 'source'), } From 06d52c87314e0bbc16c43c405090843885577b88 Mon Sep 17 00:00:00 2001 From: fireattack Date: Thu, 9 May 2024 05:09:38 +0800 Subject: [PATCH 044/124] 
[ie/BilibiliSpaceVideo] Better error message (#9839) Closes #9528 Authored by: fireattack --- yt_dlp/extractor/bilibili.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index fee4b29940..6221e9a51e 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1049,9 +1049,10 @@ def fetch_page(page_idx): raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise - if response['code'] == -401: + if response['code'] in (-352, -401): raise ExtractorError( - 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) + f'Request is blocked by server ({-response["code"]}), ' + 'please add cookies, wait and try later.', expected=True) return response['data'] def get_metadata(page_data): From 2338827072dacab0f15348b70aec8685feefc8d1 Mon Sep 17 00:00:00 2001 From: fireattack Date: Thu, 9 May 2024 05:24:44 +0800 Subject: [PATCH 045/124] [ie/bilibili] Fix `--geo-verification-proxy` support (#9817) Closes #9797 Authored by: fireattack --- yt_dlp/extractor/bilibili.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 6221e9a51e..df34700033 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -93,11 +93,11 @@ def extract_formats(self, play_info): return formats - def _download_playinfo(self, video_id, cid): + def _download_playinfo(self, video_id, cid, headers=None): return self._download_json( 'https://api.bilibili.com/x/player/playurl', video_id, query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, - note=f'Downloading video formats for cid {cid}')['data'] + note=f'Downloading video formats for cid {cid}', headers=headers)['data'] def json2srt(self, json_data): srt_data = '' @@ -493,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, video_id) + headers = self.geo_verification_headers() + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers) if not self._match_valid_url(urlh.url): return self.url_result(urlh.url) @@ -531,7 +532,7 @@ def _real_extract(self, url): self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, - note='Extracting videos in anthology'), + note='Extracting videos in anthology', headers=headers), 'data', expected_type=list) or [] is_anthology = len(page_list_json) > 1 @@ -552,7 +553,7 @@ def _real_extract(self, url): festival_info = {} if is_festival: - play_info = self._download_playinfo(video_id, cid) + play_info = self._download_playinfo(video_id, cid, headers=headers) festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), @@ -666,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE): def _real_extract(self, url): episode_id = self._match_id(url) - webpage = self._download_webpage(url, episode_id) + headers = self.geo_verification_headers() + webpage = self._download_webpage(url, episode_id, headers=headers) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - headers = {'Referer': url, **self.geo_verification_headers()} + headers['Referer'] = url play_info = self._download_json( 
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, @@ -724,7 +726,7 @@ def _real_extract(self, url): 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), - 'http_headers': headers, + 'http_headers': {'Referer': url}, } From c4b87dd885ee5391e5f481e7c8bd550a7c543623 Mon Sep 17 00:00:00 2001 From: src-tinkerer <149616646+src-tinkerer@users.noreply.github.com> Date: Wed, 8 May 2024 21:27:30 +0000 Subject: [PATCH 046/124] [ie/ZenYandex] Fix extractor (#9813) Closes #9803 Authored by: src-tinkerer --- yt_dlp/extractor/yandexvideo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index 4382a5684a..95a9446e30 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -259,15 +259,15 @@ def _real_extract(self, url): webpage = self._download_webpage(redirect, video_id, note='Redirecting') data_json = self._search_json( r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') - serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', - webpage, 'server state').replace('State', 'Settings') + serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state') uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', webpage, 'uploader', default='<a>') uploader_name = extract_attributes(uploader).get('aria-label') - video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict) - stream_urls = try_get(video_json, lambda x: x['video']['streams']) + item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str})) + video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {} + formats, subtitles = [], {} - for s_url in stream_urls: + for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})): ext = determine_ext(s_url) if ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash') From 6b54cccdcb892bca3e55993480d8b86f1c7e6da6 Mon Sep 17 00:00:00 2001 From: Alexandre Huot Date: Wed, 8 May 2024 18:10:06 -0400 Subject: [PATCH 047/124] [ie/Qub] Fix extractor (#7019) Closes #4989 Authored by: alexhuot1, dirkf --- yt_dlp/extractor/tva.py | 44 +++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py index 9afe233284..e3e10557c2 100644 --- a/yt_dlp/extractor/tva.py +++ b/yt_dlp/extractor/tva.py @@ -1,10 +1,9 @@ +import functools +import re + from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - smuggle_url, - strip_or_none, -) +from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none +from ..utils.traversal import traverse_obj class TVAIE(InfoExtractor): @@ -49,11 +48,20 @@ class QubIE(InfoExtractor): 'info_dict': { 'id': '6084352463001', 'ext': 'mp4', - 'title': 'Épisode 01', + 'title': 'Ép 01. Mon dernier jour', 'uploader_id': '5481942443001', 'upload_date': '20190907', 'timestamp': 1567899756, 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + 'thumbnail': r're:https://.+\.jpg', + 'episode': 'Ép 01. 
Mon dernier jour', + 'episode_number': 1, + 'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'], + 'duration': 2625.963, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Alerte Amber', + 'channel': 'TVA', }, }, { 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', @@ -64,22 +72,24 @@ class QubIE(InfoExtractor): def _real_extract(self, url): entity_id = self._match_id(url) - entity = self._download_json( - 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', - entity_id, query={'id': entity_id}) + webpage = self._download_webpage(url, entity_id) + entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData'] video_id = entity['videoId'] episode = strip_or_none(entity.get('name')) return { '_type': 'url_transparent', + 'url': f'https://videos.tva.ca/details/_{video_id}', + 'ie_key': TVAIE.ie_key(), 'id': video_id, 'title': episode, - # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], - 'url': 'https://videos.tva.ca/details/_' + video_id, - 'description': entity.get('longDescription'), - 'duration': float_or_none(entity.get('durationMillis'), 1000), 'episode': episode, - 'episode_number': int_or_none(entity.get('episodeNumber')), - # 'ie_key': 'BrightcoveNew', - 'ie_key': TVAIE.ie_key(), + **traverse_obj(entity, { + 'description': ('longDescription', {str}), + 'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}), + 'channel': ('knownEntities', 'channel', 'name', {str}), + 'series': ('knownEntities', 'videoShow', 'name', {str}), + 'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + }), } From 73f12119b52d98281804b0c072b2ed6aa841ec88 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Fri, 10 May 2024 17:13:35 +0000 Subject: [PATCH 048/124] [ie/netease:program] Improve `--no-playlist` message (#9488) Authored by: pzhlkj6612 --- yt_dlp/extractor/neteasemusic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 73b33a9f94..b54c12e1e2 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -561,7 +561,8 @@ def _real_extract(self, url): 'timestamp': ('createTime', {self.kilo_or_none}), }) - if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): + if not self._yes_playlist( + info['songs'] and program_id, info['mainSong']['id'], playlist_label='program', video_label='song'): formats = self.extract_formats(info['mainSong']) return { From 00a9f2e1f7fa69499221f2e8dd73a08efeef79bc Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Sat, 11 May 2024 01:19:57 +0800 Subject: [PATCH 049/124] [ie/canalalpha] Fix extractor (#9675) Authored by: kclauhk --- yt_dlp/extractor/canalalpha.py | 35 +++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py index df5ca58187..745e6954c7 100644 --- a/yt_dlp/extractor/canalalpha.py +++ b/yt_dlp/extractor/canalalpha.py @@ -40,7 +40,7 @@ class CanalAlphaIE(InfoExtractor): 'id': '24484', 'ext': 'mp4', 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', - 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'description': 'md5:85d594a3b5dc6ccfc4a85aba6e73b129', 'thumbnail': 
'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', 'upload_date': '20211026', 'duration': 360, @@ -58,14 +58,25 @@ class CanalAlphaIE(InfoExtractor): 'duration': 360, }, 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/33500/encore-des-mesures-deconomie-dans-le-jura', + 'info_dict': { + 'id': '33500', + 'ext': 'mp4', + 'title': 'Encore des mesures d\'économie dans le Jura', + 'description': 'md5:938b5b556592f2d1b9ab150268082a80', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_46665.jpg', + 'upload_date': '20240411', + 'duration': 105, + }, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) data_json = self._parse_json(self._search_regex( r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', - webpage, 'data_json'), id)['1']['data']['data'] + webpage, 'data_json'), video_id)['1']['data']['data'] manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} subtitles = {} formats = [{ @@ -75,15 +86,17 @@ def _real_extract(self, url): 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] if manifests.get('hls'): - m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) - formats.extend(m3u8_frmts) - subtitles = self._merge_subtitles(subtitles, m3u8_subs) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + manifests['hls'], video_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) if manifests.get('dash'): - dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) - formats.extend(dash_frmts) - subtitles = self._merge_subtitles(subtitles, dash_subs) + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifests['dash'], video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { - 'id': id, + 'id': video_id, 'title': data_json.get('title').strip(), 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), 'thumbnail': data_json.get('poster'), From 98d71d8c5e5dab08b561ee6f137e968d2a004262 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 10 May 2024 19:20:55 +0200 Subject: [PATCH 050/124] [ie/commonmistakes] Raise error on blob URLs (#9897) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 6 +++++- yt_dlp/extractor/commonmistakes.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42034275b9..1f095c932a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -387,7 +387,11 @@ ComedyCentralIE, ComedyCentralTVIE, ) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonmistakes import ( + BlobIE, + CommonMistakesIE, + UnicodeBOMIE, +) from .commonprotocols import ( MmsIE, RtmpIE, diff --git a/yt_dlp/extractor/commonmistakes.py b/yt_dlp/extractor/commonmistakes.py index 1d3b61c732..4514424e8e 100644 --- a/yt_dlp/extractor/commonmistakes.py +++ b/yt_dlp/extractor/commonmistakes.py @@ -40,3 +40,19 @@ def _real_extract(self, url): 'Your URL starts with a Byte Order Mark (BOM). 
' 'Removing the BOM and looking for "%s" ...' % real_url) return self.url_result(real_url) + + +class BlobIE(InfoExtractor): + IE_DESC = False + _VALID_URL = r'blob:' + + _TESTS = [{ + 'url': 'blob:https://www.youtube.com/4eb3d090-a761-46e6-8083-c32016a36e3b', + 'only_matching': True, + }] + + def _real_extract(self, url): + raise ExtractorError( + 'You\'ve asked yt-dlp to download a blob URL. ' + 'A blob URL exists only locally in your browser. ' + 'It is not possible for yt-dlp to access it.', expected=True) From 3c7a287e281d9f9a353dce8902ff78a84c24a040 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 11 May 2024 10:06:58 +1200 Subject: [PATCH 051/124] [test] Add HTTP proxy tests (#9578) Also fixes HTTPS proxies for curl_cffi Authored by: coletdjnz --- test/conftest.py | 50 ++++- test/helper.py | 5 + test/test_http_proxy.py | 379 +++++++++++++++++++++++++++++++++ test/test_networking.py | 271 ++++++++++------------- test/test_websockets.py | 55 +++-- yt_dlp/networking/_curlcffi.py | 14 +- 6 files changed, 595 insertions(+), 179 deletions(-) create mode 100644 test/test_http_proxy.py diff --git a/test/conftest.py b/test/conftest.py index 2fbc269e1f..decd2c85c8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,4 +1,3 @@ -import functools import inspect import pytest @@ -10,7 +9,9 @@ @pytest.fixture def handler(request): - RH_KEY = request.param + RH_KEY = getattr(request, 'param', None) + if not RH_KEY: + return if inspect.isclass(RH_KEY) and issubclass(RH_KEY, RequestHandler): handler = RH_KEY elif RH_KEY in _REQUEST_HANDLERS: @@ -18,9 +19,46 @@ def handler(request): else: pytest.skip(f'{RH_KEY} request handler is not available') - return functools.partial(handler, logger=FakeLogger) + class HandlerWrapper(handler): + RH_KEY = handler.RH_KEY + + def __init__(self, *args, **kwargs): + super().__init__(logger=FakeLogger, *args, **kwargs) + + return HandlerWrapper -def validate_and_send(rh, req): - rh.validate(req) - return rh.send(req) +@pytest.fixture(autouse=True) +def skip_handler(request, handler): + """usage: pytest.mark.skip_handler('my_handler', 'reason')""" + for marker in request.node.iter_markers('skip_handler'): + if marker.args[0] == handler.RH_KEY: + pytest.skip(marker.args[1] if len(marker.args) > 1 else '') + + +@pytest.fixture(autouse=True) +def skip_handler_if(request, handler): + """usage: pytest.mark.skip_handler_if('my_handler', lambda request: True, 'reason')""" + for marker in request.node.iter_markers('skip_handler_if'): + if marker.args[0] == handler.RH_KEY and marker.args[1](request): + pytest.skip(marker.args[2] if len(marker.args) > 2 else '') + + +@pytest.fixture(autouse=True) +def skip_handlers_if(request, handler): + """usage: pytest.mark.skip_handlers_if(lambda request, handler: True, 'reason')""" + for marker in request.node.iter_markers('skip_handlers_if'): + if handler and marker.args[0](request, handler): + pytest.skip(marker.args[1] if len(marker.args) > 1 else '') + + +def pytest_configure(config): + config.addinivalue_line( + "markers", "skip_handler(handler): skip test for the given handler", + ) + config.addinivalue_line( + "markers", "skip_handler_if(handler): skip test for the given handler if condition is true" + ) + config.addinivalue_line( + "markers", "skip_handlers_if(handler): skip test for handlers when the condition is true" + ) diff --git a/test/helper.py b/test/helper.py index 7760fd8d7f..e7473120d1 100644 --- a/test/helper.py +++ b/test/helper.py @@ -338,3 +338,8 @@ def http_server_port(httpd): def 
verify_address_availability(address): if find_available_port(address) is None: pytest.skip(f'Unable to bind to source address {address} (address may not exist)') + + +def validate_and_send(rh, req): + rh.validate(req) + return rh.send(req) diff --git a/test/test_http_proxy.py b/test/test_http_proxy.py new file mode 100644 index 0000000000..c1d7c53f51 --- /dev/null +++ b/test/test_http_proxy.py @@ -0,0 +1,379 @@ +import abc +import base64 +import contextlib +import functools +import json +import os +import random +import ssl +import threading +from http.server import BaseHTTPRequestHandler +from socketserver import ThreadingTCPServer + +import pytest + +from test.helper import http_server_port, verify_address_availability +from test.test_networking import TEST_DIR +from test.test_socks import IPv6ThreadingTCPServer +from yt_dlp.dependencies import urllib3 +from yt_dlp.networking import Request +from yt_dlp.networking.exceptions import HTTPError, ProxyError, SSLError + + +class HTTPProxyAuthMixin: + + def proxy_auth_error(self): + self.send_response(407) + self.send_header('Proxy-Authenticate', 'Basic realm="test http proxy"') + self.end_headers() + return False + + def do_proxy_auth(self, username, password): + if username is None and password is None: + return True + + proxy_auth_header = self.headers.get('Proxy-Authorization', None) + if proxy_auth_header is None: + return self.proxy_auth_error() + + if not proxy_auth_header.startswith('Basic '): + return self.proxy_auth_error() + + auth = proxy_auth_header[6:] + + try: + auth_username, auth_password = base64.b64decode(auth).decode().split(':', 1) + except Exception: + return self.proxy_auth_error() + + if auth_username != (username or '') or auth_password != (password or ''): + return self.proxy_auth_error() + return True + + +class HTTPProxyHandler(BaseHTTPRequestHandler, HTTPProxyAuthMixin): + def __init__(self, *args, proxy_info=None, username=None, password=None, request_handler=None, **kwargs): + self.username = username + self.password = password + self.proxy_info = proxy_info + super().__init__(*args, **kwargs) + + def do_GET(self): + if not self.do_proxy_auth(self.username, self.password): + self.server.close_request(self.request) + return + if self.path.endswith('/proxy_info'): + payload = json.dumps(self.proxy_info or { + 'client_address': self.client_address, + 'connect': False, + 'connect_host': None, + 'connect_port': None, + 'headers': dict(self.headers), + 'path': self.path, + 'proxy': ':'.join(str(y) for y in self.connection.getsockname()), + }) + self.send_response(200) + self.send_header('Content-Type', 'application/json; charset=utf-8') + self.send_header('Content-Length', str(len(payload))) + self.end_headers() + self.wfile.write(payload.encode()) + else: + self.send_response(404) + self.end_headers() + + self.server.close_request(self.request) + + +if urllib3: + import urllib3.util.ssltransport + + class SSLTransport(urllib3.util.ssltransport.SSLTransport): + """ + Modified version of urllib3 SSLTransport to support server side SSL + + This allows us to chain multiple TLS connections. 
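+    An HTTPS request sent through an HTTPS proxy is TLS wrapped inside TLS,
+    and the stdlib ssl module cannot wrap an SSLSocket a second time; driving
+    the inner TLS session through in-memory BIOs side-steps that limitation.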
+ """ + def __init__(self, socket, ssl_context, server_hostname=None, suppress_ragged_eofs=True, server_side=False): + self.incoming = ssl.MemoryBIO() + self.outgoing = ssl.MemoryBIO() + + self.suppress_ragged_eofs = suppress_ragged_eofs + self.socket = socket + + self.sslobj = ssl_context.wrap_bio( + self.incoming, + self.outgoing, + server_hostname=server_hostname, + server_side=server_side + ) + self._ssl_io_loop(self.sslobj.do_handshake) + + @property + def _io_refs(self): + return self.socket._io_refs + + @_io_refs.setter + def _io_refs(self, value): + self.socket._io_refs = value + + def shutdown(self, *args, **kwargs): + self.socket.shutdown(*args, **kwargs) +else: + SSLTransport = None + + +class HTTPSProxyHandler(HTTPProxyHandler): + def __init__(self, request, *args, **kwargs): + certfn = os.path.join(TEST_DIR, 'testcert.pem') + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + sslctx.load_cert_chain(certfn, None) + if isinstance(request, ssl.SSLSocket): + request = SSLTransport(request, ssl_context=sslctx, server_side=True) + else: + request = sslctx.wrap_socket(request, server_side=True) + super().__init__(request, *args, **kwargs) + + +class HTTPConnectProxyHandler(BaseHTTPRequestHandler, HTTPProxyAuthMixin): + protocol_version = 'HTTP/1.1' + default_request_version = 'HTTP/1.1' + + def __init__(self, *args, username=None, password=None, request_handler=None, **kwargs): + self.username = username + self.password = password + self.request_handler = request_handler + super().__init__(*args, **kwargs) + + def do_CONNECT(self): + if not self.do_proxy_auth(self.username, self.password): + self.server.close_request(self.request) + return + self.send_response(200) + self.end_headers() + proxy_info = { + 'client_address': self.client_address, + 'connect': True, + 'connect_host': self.path.split(':')[0], + 'connect_port': int(self.path.split(':')[1]), + 'headers': dict(self.headers), + 'path': self.path, + 'proxy': ':'.join(str(y) for y in self.connection.getsockname()), + } + self.request_handler(self.request, self.client_address, self.server, proxy_info=proxy_info) + self.server.close_request(self.request) + + +class HTTPSConnectProxyHandler(HTTPConnectProxyHandler): + def __init__(self, request, *args, **kwargs): + certfn = os.path.join(TEST_DIR, 'testcert.pem') + sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + sslctx.load_cert_chain(certfn, None) + request = sslctx.wrap_socket(request, server_side=True) + self._original_request = request + super().__init__(request, *args, **kwargs) + + def do_CONNECT(self): + super().do_CONNECT() + self.server.close_request(self._original_request) + + +@contextlib.contextmanager +def proxy_server(proxy_server_class, request_handler, bind_ip=None, **proxy_server_kwargs): + server = server_thread = None + try: + bind_address = bind_ip or '127.0.0.1' + server_type = ThreadingTCPServer if '.' in bind_address else IPv6ThreadingTCPServer + server = server_type( + (bind_address, 0), functools.partial(proxy_server_class, request_handler=request_handler, **proxy_server_kwargs)) + server_port = http_server_port(server) + server_thread = threading.Thread(target=server.serve_forever) + server_thread.daemon = True + server_thread.start() + if '.' 
not in bind_address: + yield f'[{bind_address}]:{server_port}' + else: + yield f'{bind_address}:{server_port}' + finally: + server.shutdown() + server.server_close() + server_thread.join(2.0) + + +class HTTPProxyTestContext(abc.ABC): + REQUEST_HANDLER_CLASS = None + REQUEST_PROTO = None + + def http_server(self, server_class, *args, **kwargs): + return proxy_server(server_class, self.REQUEST_HANDLER_CLASS, *args, **kwargs) + + @abc.abstractmethod + def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs) -> dict: + """return a dict of proxy_info""" + + +class HTTPProxyHTTPTestContext(HTTPProxyTestContext): + # Standard HTTP Proxy for http requests + REQUEST_HANDLER_CLASS = HTTPProxyHandler + REQUEST_PROTO = 'http' + + def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs): + request = Request(f'http://{target_domain or "127.0.0.1"}:{target_port or "40000"}/proxy_info', **req_kwargs) + handler.validate(request) + return json.loads(handler.send(request).read().decode()) + + +class HTTPProxyHTTPSTestContext(HTTPProxyTestContext): + # HTTP Connect proxy, for https requests + REQUEST_HANDLER_CLASS = HTTPSProxyHandler + REQUEST_PROTO = 'https' + + def proxy_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs): + request = Request(f'https://{target_domain or "127.0.0.1"}:{target_port or "40000"}/proxy_info', **req_kwargs) + handler.validate(request) + return json.loads(handler.send(request).read().decode()) + + +CTX_MAP = { + 'http': HTTPProxyHTTPTestContext, + 'https': HTTPProxyHTTPSTestContext, +} + + +@pytest.fixture(scope='module') +def ctx(request): + return CTX_MAP[request.param]() + + +@pytest.mark.parametrize( + 'handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) +@pytest.mark.parametrize('ctx', ['http'], indirect=True) # pure http proxy can only support http +class TestHTTPProxy: + def test_http_no_auth(self, handler, ctx): + with ctx.http_server(HTTPProxyHandler) as server_address: + with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['connect'] is False + assert 'Proxy-Authorization' not in proxy_info['headers'] + + def test_http_auth(self, handler, ctx): + with ctx.http_server(HTTPProxyHandler, username='test', password='test') as server_address: + with handler(proxies={ctx.REQUEST_PROTO: f'http://test:test@{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert 'Proxy-Authorization' in proxy_info['headers'] + + def test_http_bad_auth(self, handler, ctx): + with ctx.http_server(HTTPProxyHandler, username='test', password='test') as server_address: + with handler(proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh: + with pytest.raises(HTTPError) as exc_info: + ctx.proxy_info_request(rh) + assert exc_info.value.response.status == 407 + exc_info.value.response.close() + + def test_http_source_address(self, handler, ctx): + with ctx.http_server(HTTPProxyHandler) as server_address: + source_address = f'127.0.0.{random.randint(5, 255)}' + verify_address_availability(source_address) + with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}, + source_address=source_address) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['client_address'][0] == source_address + + 
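+    # In the tests below, an "https proxy" means TLS is spoken to the proxy
+    # itself (an https:// proxy URL); the proxied requests in this class are
+    # still plain HTTP, forwarded as absolute-URI GETs rather than tunnelled
+    # through CONNECT.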
@pytest.mark.skip_handler('Urllib', 'urllib does not support https proxies') + def test_https(self, handler, ctx): + with ctx.http_server(HTTPSProxyHandler) as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['connect'] is False + assert 'Proxy-Authorization' not in proxy_info['headers'] + + @pytest.mark.skip_handler('Urllib', 'urllib does not support https proxies') + def test_https_verify_failed(self, handler, ctx): + with ctx.http_server(HTTPSProxyHandler) as server_address: + with handler(verify=True, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh: + # Accept SSLError as may not be feasible to tell if it is proxy or request error. + # note: if request proto also does ssl verification, this may also be the error of the request. + # Until we can support passing custom cacerts to handlers, we cannot properly test this for all cases. + with pytest.raises((ProxyError, SSLError)): + ctx.proxy_info_request(rh) + + def test_http_with_idn(self, handler, ctx): + with ctx.http_server(HTTPProxyHandler) as server_address: + with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh, target_domain='中文.tw') + assert proxy_info['proxy'] == server_address + assert proxy_info['path'].startswith('http://xn--fiq228c.tw') + assert proxy_info['headers']['Host'].split(':', 1)[0] == 'xn--fiq228c.tw' + + +@pytest.mark.parametrize( + 'handler,ctx', [ + ('Requests', 'https'), + ('CurlCFFI', 'https'), + ], indirect=True) +class TestHTTPConnectProxy: + def test_http_connect_no_auth(self, handler, ctx): + with ctx.http_server(HTTPConnectProxyHandler) as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['connect'] is True + assert 'Proxy-Authorization' not in proxy_info['headers'] + + def test_http_connect_auth(self, handler, ctx): + with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:test@{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert 'Proxy-Authorization' in proxy_info['headers'] + + @pytest.mark.skip_handler( + 'Requests', + 'bug in urllib3 causes unclosed socket: https://github.com/urllib3/urllib3/issues/3374' + ) + def test_http_connect_bad_auth(self, handler, ctx): + with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh: + with pytest.raises(ProxyError): + ctx.proxy_info_request(rh) + + def test_http_connect_source_address(self, handler, ctx): + with ctx.http_server(HTTPConnectProxyHandler) as server_address: + source_address = f'127.0.0.{random.randint(5, 255)}' + verify_address_availability(source_address) + with handler(proxies={ctx.REQUEST_PROTO: f'http://{server_address}'}, + source_address=source_address, + verify=False) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['client_address'][0] == source_address + + @pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test') + def 
test_https_connect_proxy(self, handler, ctx): + with ctx.http_server(HTTPSConnectProxyHandler) as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert proxy_info['connect'] is True + assert 'Proxy-Authorization' not in proxy_info['headers'] + + @pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test') + def test_https_connect_verify_failed(self, handler, ctx): + with ctx.http_server(HTTPSConnectProxyHandler) as server_address: + with handler(verify=True, proxies={ctx.REQUEST_PROTO: f'https://{server_address}'}) as rh: + # Accept SSLError as may not be feasible to tell if it is proxy or request error. + # note: if request proto also does ssl verification, this may also be the error of the request. + # Until we can support passing custom cacerts to handlers, we cannot properly test this for all cases. + with pytest.raises((ProxyError, SSLError)): + ctx.proxy_info_request(rh) + + @pytest.mark.skipif(urllib3 is None, reason='requires urllib3 to test') + def test_https_connect_proxy_auth(self, handler, ctx): + with ctx.http_server(HTTPSConnectProxyHandler, username='test', password='test') as server_address: + with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'https://test:test@{server_address}'}) as rh: + proxy_info = ctx.proxy_info_request(rh) + assert proxy_info['proxy'] == server_address + assert 'Proxy-Authorization' in proxy_info['headers'] diff --git a/test/test_networking.py b/test/test_networking.py index d613cb5681..994467014d 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -6,6 +6,8 @@ import pytest +from yt_dlp.networking.common import Features + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import gzip @@ -27,8 +29,12 @@ from email.message import Message from http.cookiejar import CookieJar -from test.conftest import validate_and_send -from test.helper import FakeYDL, http_server_port, verify_address_availability +from test.helper import ( + FakeYDL, + http_server_port, + validate_and_send, + verify_address_availability, +) from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( @@ -62,21 +68,6 @@ TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def _build_proxy_handler(name): - class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): - proxy_name = name - - def log_message(self, format, *args): - pass - - def do_GET(self): - self.send_response(200) - self.send_header('Content-Type', 'text/plain; charset=utf-8') - self.end_headers() - self.wfile.write(f'{self.proxy_name}: {self.path}'.encode()) - return HTTPTestRequestHandler - - class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' default_request_version = 'HTTP/1.1' @@ -317,8 +308,9 @@ def setup_class(cls): cls.https_server_thread.start() +@pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) + def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -329,7 +321,6 @@ def test_verify_cert(self, handler): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def 
test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? @@ -347,7 +338,6 @@ def test_ssl_error(self, handler): validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -359,7 +349,6 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) @pytest.mark.parametrize('path', [ '/a/b/./../../headers', '/redirect_dotsegments', @@ -375,15 +364,13 @@ def test_remove_dot_segments(self, handler, path): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - # Not supported by CurlCFFI (non-standard) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi (non-standard)') def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -393,7 +380,6 @@ def test_raise_http_error(self, handler): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -405,7 +391,6 @@ def test_response_url(self, handler): res2.close() # Covers some basic cases we expect some level of consistency between request handlers for - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) @pytest.mark.parametrize('redirect_status,method,expected', [ # A 303 must either use GET or HEAD for subsequent request (303, 'POST', ('', 'GET', False)), @@ -447,7 +432,6 @@ def test_redirect(self, handler, redirect_status, method, expected): assert expected[1] == res.headers.get('method') assert expected[2] == ('content-length' in headers.decode().lower()) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. 
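        # An explicitly specified Cookie header should take precedence over
        # cookies from the cookiejar for that request, and should be stripped
        # on redirects so the value cannot leak to a different host.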
with handler() as rh: @@ -480,19 +464,16 @@ def test_request_cookie_header(self, handler): assert b'cookie: test=ytdlp' not in data.lower() assert b'cookie: test=test3' in data.lower() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead, match='13 bytes read, 234221 more expected'): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -509,7 +490,6 @@ def test_cookies(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'cookie: test=ytdlp' in data.lower() - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -525,7 +505,6 @@ def test_headers(self, handler): assert b'test2: test2' not in data assert b'test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_read_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -541,7 +520,6 @@ def test_read_timeout(self, handler): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_connect_timeout(self, handler): # nothing should be listening on this port connect_timeout_url = 'http://10.255.255.255' @@ -560,7 +538,6 @@ def test_connect_timeout(self, handler): rh, Request(connect_timeout_url, extensions={'timeout': 0.01})) assert 0.01 <= time.time() - now < 20 - @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' # on some systems these loopback addresses we need for testing may not be available @@ -572,13 +549,13 @@ def test_source_address(self, handler): assert source_address == data # Not supported by CurlCFFI - @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) + @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi') def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '