From a3e964211611ec60a3f84688ab9ff30e4c1504f6 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 7 Aug 2022 13:43:20 +0000 Subject: [PATCH 001/284] [extractor/youtube] Prevent redirect to unwanted videos (#4593) Example: https://www.youtube.com/watch?v=aQvGIIdgFDM Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4ad8cf9003..1b4e47b5f9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3133,7 +3133,14 @@ def append_client(*client_names): continue if pr: - prs.append(pr) + # YouTube may return a different video player response than expected. + # See: https://github.com/TeamNewPipe/NewPipe/issues/8713 + pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId')) + if pr_video_id and pr_video_id != video_id: + self.report_warning( + f'{client} client returned a player response for "{pr_video_id}" instead of "{video_id}"' + bug_reports_message()) + else: + prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: From 1f6b90ed8db7006e2f2d539c41c8f3e59058dd00 Mon Sep 17 00:00:00 2001 From: HobbyistDev Date: Sun, 7 Aug 2022 08:12:23 +0900 Subject: [PATCH 002/284] [extractor/tviplayer] Improve `_VALID_URL` (#4585) Closes #4578 Authored by: HobbyistDev --- yt_dlp/extractor/tviplayer.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/tviplayer.py b/yt_dlp/extractor/tviplayer.py index 96a27a3a9b..f60cfb050e 100644 --- a/yt_dlp/extractor/tviplayer.py +++ b/yt_dlp/extractor/tviplayer.py @@ -3,7 +3,7 @@ class TVIPlayerIE(InfoExtractor): - _VALID_URL = r'https?://tviplayer\.iol\.pt(/programa/[\w-]+/[a-f0-9]+)?/video/(?P[a-f0-9]+)' + _VALID_URL = r'https?://tviplayer\.iol\.pt(/programa/[\w-]+/[a-f0-9]+)?/\w+/(?P\w+)' _TESTS = [{ 'url': 'https://tviplayer.iol.pt/programa/jornal-das-8/53c6b3903004dc006243d0cf/video/61c8e8b90cf2c7ea0f0f71a9', 'info_dict': { @@ -27,6 +27,7 @@ class TVIPlayerIE(InfoExtractor): 'season_number': 1, } }, { + # no /programa/ 'url': 'https://tviplayer.iol.pt/video/62c4131c0cf2f9a86eac06bb', 'info_dict': { 'id': '62c4131c0cf2f9a86eac06bb', @@ -37,6 +38,18 @@ class TVIPlayerIE(InfoExtractor): 'duration': 148, 'season_number': 2, } + }, { + # episodio url + 'url': 'https://tviplayer.iol.pt/programa/para-sempre/61716c360cf2365a5ed894c4/episodio/t1e187', + 'info_dict': { + 'id': 't1e187', + 'ext': 'mp4', + 'season': 'Season 1', + 'title': 'Quem denunciou Pedro?', + 'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62eda30b0cf2ea367d48973b/', + 'duration': 1250, + 'season_number': 1, + } }] def _real_initialize(self): From 22b22b7d5c9dafa1d3f2dac25522bdd8b4091de4 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 7 Aug 2022 20:40:36 +0530 Subject: [PATCH 003/284] [extractor/WASDTV:record] Fix `_VALID_URL` --- yt_dlp/extractor/wasdtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/wasdtv.py b/yt_dlp/extractor/wasdtv.py index bf1ad65b23..bad5ccb993 100644 --- a/yt_dlp/extractor/wasdtv.py +++ b/yt_dlp/extractor/wasdtv.py @@ -95,7 +95,7 @@ def _get_media_url(self, media_meta): class WASDTVRecordIE(WASDTVBaseIE): IE_NAME = 'wasdtv:record' - _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P\d+)$' + _VALID_URL = r'https?://wasd\.tv/[^/#?]+(?:/videos)?\?record=(?P\d+)$' _TESTS = [{ 'url': 
'https://wasd.tv/spacemita/videos?record=907755', 'md5': 'c9899dd85be4cc997816ff9f9ca516ce', @@ -110,6 +110,9 @@ class WASDTVRecordIE(WASDTVBaseIE): 'is_live': False, 'view_count': int, }, + }, { + 'url': 'https://wasd.tv/spacemita?record=907755', + 'only_matching': True, }] def _get_container(self, url): From b8ed0f15d4a86e815da72bae9c7ef7ae106dd86b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 8 Aug 2022 01:35:36 +0530 Subject: [PATCH 004/284] [extractor] Add field `audio_channels` --- README.md | 2 ++ yt_dlp/YoutubeDL.py | 9 ++++++--- yt_dlp/extractor/common.py | 7 +++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 285c0b78a0..09ca5d876f 100644 --- a/README.md +++ b/README.md @@ -1276,6 +1276,7 @@ # OUTPUT TEMPLATE - `vbr` (numeric): Average video bitrate in KBit/s - `fps` (numeric): Frame rate - `dynamic_range` (string): The dynamic range of the video + - `audio_channels` (numeric): The number of audio channels - `stretched_ratio` (float): `width:height` of the video's pixels, if not square - `vcodec` (string): Name of the video codec in use - `container` (string): Name of the container format @@ -1529,6 +1530,7 @@ ## Sorting Formats - `res`: Video resolution, calculated as the smallest dimension. - `fps`: Framerate of video - `hdr`: The dynamic range of the video (`DV` > `HDR12` > `HDR10+` > `HDR10` > `HLG` > `SDR`) + - `channels`: The number of audio channels - `tbr`: Total average bitrate in KBit/s - `vbr`: Average video bitrate in KBit/s - `abr`: Average audio bitrate in KBit/s diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 25473611ba..ded34b8edc 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -527,7 +527,8 @@ class YoutubeDL: """ _NUMERIC_FIELDS = { - 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + 'width', 'height', 'asr', 'audio_channels', 'fps', + 'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx', 'timestamp', 'release_timestamp', 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', 'average_rating', 'comment_count', 'age_limit', @@ -539,7 +540,7 @@ class YoutubeDL: _format_fields = { # NB: Keep in sync with the docstring of extractor/common.py 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', - 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', + 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'preference', 'language', 'language_preference', 'quality', 'source_preference', @@ -2129,6 +2130,7 @@ def _merge(formats_pair): 'acodec': the_only_audio.get('acodec'), 'abr': the_only_audio.get('abr'), 'asr': the_only_audio.get('asr'), + 'audio_channels': the_only_audio.get('audio_channels') }) return new_dict @@ -3569,6 +3571,7 @@ def simplified_codec(f, field): format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), format_field(f, 'fps', '\t%d', func=round), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + format_field(f, 'audio_channels', '\t%s'), delim, format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), format_field(f, 'tbr', '\t%dk', func=round), @@ -3588,7 +3591,7 @@ def simplified_codec(f, field): delim=' '), ] for f in formats if 
f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( - 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') return render_table( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bf3fc8258f..8afbc76d16 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -154,6 +154,7 @@ class InfoExtractor: * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz + * audio_channels Number of audio channels * vbr Average video bitrate in KBit/s * fps Frame rate * vcodec Name of the video codec in use @@ -1668,7 +1669,7 @@ class FormatSort: regex = r' *((?P\+)?(?P[a-zA-Z0-9_]+)((?P[~:])(?P.*?))?)? *$' default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', + 'res', 'fps', 'hdr:12', 'channels', 'codec:vp9.2', 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', @@ -1704,6 +1705,7 @@ class FormatSort: 'height': {'convert': 'float_none'}, 'width': {'convert': 'float_none'}, 'fps': {'convert': 'float_none'}, + 'channels': {'convert': 'float_none', 'field': 'audio_channels'}, 'tbr': {'convert': 'float_none'}, 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, @@ -1717,13 +1719,14 @@ class FormatSort: 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - # For compatibility with youtube-dl + # Actual field names 'format_id': {'type': 'alias', 'field': 'id'}, 'preference': {'type': 'alias', 'field': 'ie_pref'}, 'language_preference': {'type': 'alias', 'field': 'lang'}, 'source_preference': {'type': 'alias', 'field': 'source'}, 'protocol': {'type': 'alias', 'field': 'proto'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, + 'audio_channels': {'type': 'alias', 'field': 'channels'}, # Deprecated 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, From a41662343603bc2d32648ebf0779e5fe1e18d263 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 8 Aug 2022 01:36:11 +0530 Subject: [PATCH 005/284] [extractor/youtube] Extract more format info --- yt_dlp/extractor/youtube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1b4e47b5f9..325aa0a230 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2254,6 +2254,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': [], 'uploader_url': 'http://www.youtube.com/user/nao20010128nao', } + }, { + 'note': '6 channel audio', + 'url': 'https://www.youtube.com/watch?v=zgdo7-RRjgo', + 'only_matching': True, } ] @@ -3253,10 +3257,13 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i '%s%s' % (audio_track.get('displayName') or '', ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), + try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', 
is_damaged and 'DAMAGED', delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': -10 if throttled else -5 if itag == '22' else -1, 'fps': int_or_none(fmt.get('fps')) or None, + 'audio_channels': fmt.get('audioChannels'), 'height': height, 'quality': q(quality), 'has_drm': bool(fmt.get('drmFamilies')), @@ -3577,7 +3584,7 @@ def feed_entry(name): formats.extend(self._extract_storyboard(player_responses, duration)) # source_preference is lower for throttled/potentially damaged formats - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) + self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'channels', 'source', 'codec:vp9.2', 'lang', 'proto')) info = { 'id': video_id, From 298d9c0e891b1a0fbc3ec6d3674ff6fbc550d6ec Mon Sep 17 00:00:00 2001 From: Djeson <61365937+DjesonPV@users.noreply.github.com> Date: Sun, 7 Aug 2022 22:21:53 +0200 Subject: [PATCH 006/284] [extractor/ninegag] Extract uploader (#4597) Closes #4587 Authored by: DjesonPV --- yt_dlp/extractor/ninegag.py | 45 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/ninegag.py b/yt_dlp/extractor/ninegag.py index 00ca95ea2e..86e710f2b1 100644 --- a/yt_dlp/extractor/ninegag.py +++ b/yt_dlp/extractor/ninegag.py @@ -3,7 +3,7 @@ ExtractorError, determine_ext, int_or_none, - try_get, + traverse_obj, unescapeHTML, url_or_none, ) @@ -11,18 +11,20 @@ class NineGagIE(InfoExtractor): IE_NAME = '9gag' + IE_DESC = '9GAG' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, + 'thumbnail': 'https://img-9gag-fun.9cache.com/photo/ae5Ag7B_460s.jpg', 'categories': ['Awesome'], - 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'tags': ['Awesome'], 'duration': 44, 'like_count': int, 'dislike_count': int, @@ -32,6 +34,26 @@ class NineGagIE(InfoExtractor): # HTML escaped title 'url': 'https://9gag.com/gag/av5nvyb', 'only_matching': True, + }, { + # Non Anonymous Uploader + 'url': 'https://9gag.com/gag/ajgp66G', + 'info_dict': { + 'id': 'ajgp66G', + 'ext': 'webm', + 'title': 'Master Shifu! Or Splinter! 
You decide:', + 'upload_date': '20220806', + 'timestamp': 1659803411, + 'thumbnail': 'https://img-9gag-fun.9cache.com/photo/ajgp66G_460s.jpg', + 'categories': ['Funny'], + 'tags': ['Funny'], + 'duration': 26, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'uploader': 'Peter Klaus', + 'uploader_id': 'peterklaus12', + 'uploader_url': 'https://9gag.com/u/peterklaus12', + } }] def _real_extract(self, url): @@ -46,8 +68,6 @@ def _real_extract(self, url): 'The given url does not contain a video', expected=True) - title = unescapeHTML(post['title']) - duration = None formats = [] thumbnails = [] @@ -98,7 +118,7 @@ def _real_extract(self, url): formats.append(common) self._sort_formats(formats) - section = try_get(post, lambda x: x['postSection']['name']) + section = traverse_obj(post, ('postSection', 'name')) tags = None post_tags = post.get('tags') @@ -110,18 +130,19 @@ def _real_extract(self, url): continue tags.append(tag_key) - get_count = lambda x: int_or_none(post.get(x + 'Count')) - return { 'id': post_id, - 'title': title, + 'title': unescapeHTML(post.get('title')), 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, + 'uploader': traverse_obj(post, ('creator', 'fullName')), + 'uploader_id': traverse_obj(post, ('creator', 'username')), + 'uploader_url': url_or_none(traverse_obj(post, ('creator', 'profileUrl'))), 'formats': formats, 'thumbnails': thumbnails, - 'like_count': get_count('upVote'), - 'dislike_count': get_count('downVote'), - 'comment_count': get_count('comments'), + 'like_count': int_or_none(post.get('upVoteCount')), + 'dislike_count': int_or_none(post.get('downVoteCount')), + 'comment_count': int_or_none(post.get('commentsCount')), 'age_limit': 18 if post.get('nsfw') == 1 else None, 'categories': [section] if section else None, 'tags': tags, From c7dcf0b31e57bb98472da7cf293f523caa81c4a7 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Mon, 8 Aug 2022 12:01:57 +1200 Subject: [PATCH 007/284] [extractor/youtube] Add `androidSdkVersion` parameter to Android Innertube clients Required to prevent YouTube returning a bad player response in some cases. 
See: https://github.com/yt-dlp/yt-dlp/pull/4593, https://github.com/TeamNewPipe/NewPipe/issues/8713, https://github.com/iv-org/invidious/issues/3230, https://github.com/Tyrrrz/YoutubeExplode/issues/647 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 325aa0a230..fc8825b190 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -109,7 +109,8 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '17.28.34', + 'clientVersion': '17.29.34', + 'androidSdkVersion': 30 } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, @@ -120,7 +121,8 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '17.28.34', + 'clientVersion': '17.29.34', + 'androidSdkVersion': 30 }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, @@ -132,6 +134,7 @@ 'client': { 'clientName': 'ANDROID_MUSIC', 'clientVersion': '5.16.51', + 'androidSdkVersion': 30 } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, @@ -143,6 +146,7 @@ 'client': { 'clientName': 'ANDROID_CREATOR', 'clientVersion': '22.28.100', + 'androidSdkVersion': 30 }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, @@ -3142,7 +3146,7 @@ def append_client(*client_names): pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId')) if pr_video_id and pr_video_id != video_id: self.report_warning( - f'{client} client returned a player response for "{pr_video_id}" instead of "{video_id}"' + bug_reports_message()) + f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message()) else: prs.append(pr) From c4b6c5c7c9eb0aa448d03c1540580cdd92737aa8 Mon Sep 17 00:00:00 2001 From: shirt Date: Mon, 8 Aug 2022 15:24:30 -0400 Subject: [PATCH 008/284] [build] Improve build process (#4513) Authored by: shirt-dev --- .github/workflows/build.yml | 387 ++++++++++--------------------- .github/workflows/core.yml | 4 +- .github/workflows/download.yml | 8 +- .github/workflows/quick-test.yml | 8 +- 4 files changed, 127 insertions(+), 280 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4c87f38ebd..f3cc9930d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,18 +2,17 @@ name: Build on: workflow_dispatch jobs: - create_release: + prepare: runs-on: ubuntu-latest outputs: version_suffix: ${{ steps.version_suffix.outputs.version_suffix }} ytdlp_version: ${{ steps.bump_version.outputs.ytdlp_version }} - upload_url: ${{ steps.create_release.outputs.upload_url }} - release_id: ${{ steps.create_release.outputs.id }} + head_sha: ${{ steps.push_release.outputs.head_sha }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: '3.10' @@ -43,53 +42,15 @@ jobs: PUSH_VERSION_COMMIT: ${{ secrets.PUSH_VERSION_COMMIT }} if: "env.PUSH_VERSION_COMMIT != ''" run: git push origin ${{ github.event.ref }} - - name: Get Changelog - run: | - changelog=$(grep -oPz '(?s)(?<=### ${{ steps.bump_version.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)' Changelog.md) || true - echo "changelog<> $GITHUB_ENV - echo "$changelog" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Create Release - id: create_release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ 
steps.bump_version.outputs.ytdlp_version }}
-          release_name: yt-dlp ${{ steps.bump_version.outputs.ytdlp_version }}
-          commitish: ${{ steps.push_release.outputs.head_sha }}
-          draft: true
-          prerelease: false
-          body: |
-            #### [A description of the various files]((https://github.com/yt-dlp/yt-dlp#release-files)) are in the README
-
-            ---
-            <details open><summary>Changelog</summary>
-            <p>
-
-            ${{ env.changelog }}
-
-            </p>
-            </details>
-

build_unix: - needs: create_release + needs: prepare runs-on: ubuntu-18.04 # Standalone executable should be built on minimum supported OS - outputs: - sha256_bin: ${{ steps.get_sha.outputs.sha256_bin }} - sha512_bin: ${{ steps.get_sha.outputs.sha512_bin }} - sha256_tar: ${{ steps.get_sha.outputs.sha256_tar }} - sha512_tar: ${{ steps.get_sha.outputs.sha512_tar }} - sha256_linux: ${{ steps.get_sha.outputs.sha256_linux }} - sha512_linux: ${{ steps.get_sha.outputs.sha512_linux }} - sha256_linux_zip: ${{ steps.get_sha.outputs.sha256_linux_zip }} - sha512_linux_zip: ${{ steps.get_sha.outputs.sha512_linux_zip }} steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: python-version: '3.10' - name: Install Requirements @@ -100,7 +61,7 @@ jobs: - name: Prepare run: | - python devscripts/update-version.py ${{ needs.create_release.outputs.version_suffix }} + python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} python devscripts/make_lazy_extractors.py - name: Build Unix executables run: | @@ -111,51 +72,15 @@ jobs: - name: Get SHA2-SUMS id: get_sha run: | - echo "::set-output name=sha256_bin::$(sha256sum yt-dlp | awk '{print $1}')" - echo "::set-output name=sha512_bin::$(sha512sum yt-dlp | awk '{print $1}')" - echo "::set-output name=sha256_tar::$(sha256sum yt-dlp.tar.gz | awk '{print $1}')" - echo "::set-output name=sha512_tar::$(sha512sum yt-dlp.tar.gz | awk '{print $1}')" - echo "::set-output name=sha256_linux::$(sha256sum dist/yt-dlp_linux | awk '{print $1}')" - echo "::set-output name=sha512_linux::$(sha512sum dist/yt-dlp_linux | awk '{print $1}')" - echo "::set-output name=sha256_linux_zip::$(sha256sum dist/yt-dlp_linux.zip | awk '{print $1}')" - echo "::set-output name=sha512_linux_zip::$(sha512sum dist/yt-dlp_linux.zip | awk '{print $1}')" - - name: Upload zip binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload artifacts + uses: actions/upload-artifact@v3 with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./yt-dlp - asset_name: yt-dlp - asset_content_type: application/octet-stream - - name: Upload Source tar - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./yt-dlp.tar.gz - asset_name: yt-dlp.tar.gz - asset_content_type: application/gzip - - name: Upload standalone binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_linux - asset_name: yt-dlp_linux - asset_content_type: application/octet-stream - - name: Upload onedir binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_linux.zip - asset_name: yt-dlp_linux.zip - asset_content_type: application/zip + path: | + yt-dlp + yt-dlp.tar.gz + dist/yt-dlp_linux + dist/yt-dlp_linux.zip - name: Build and publish on PyPi env: @@ -180,24 +105,19 @@ jobs: if: "env.BREW_TOKEN != ''" run: | git clone git@github.com:yt-dlp/homebrew-taps taps/ - python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.create_release.outputs.ytdlp_version }}" + python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.prepare.outputs.ytdlp_version 
}}" git -C taps/ config user.name github-actions git -C taps/ config user.email github-actions@example.com - git -C taps/ commit -am 'yt-dlp: ${{ needs.create_release.outputs.ytdlp_version }}' + git -C taps/ commit -am 'yt-dlp: ${{ needs.prepare.outputs.ytdlp_version }}' git -C taps/ push build_macos: runs-on: macos-11 - needs: create_release - outputs: - sha256_macos: ${{ steps.get_sha.outputs.sha256_macos }} - sha512_macos: ${{ steps.get_sha.outputs.sha512_macos }} - sha256_macos_zip: ${{ steps.get_sha.outputs.sha256_macos_zip }} - sha512_macos_zip: ${{ steps.get_sha.outputs.sha512_macos_zip }} + needs: prepare steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 # NB: In order to create a universal2 application, the version of python3 in /usr/bin has to be used - name: Install Requirements run: | @@ -206,50 +126,28 @@ jobs: - name: Prepare run: | - /usr/bin/python3 devscripts/update-version.py ${{ needs.create_release.outputs.version_suffix }} + /usr/bin/python3 devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} /usr/bin/python3 devscripts/make_lazy_extractors.py - name: Build run: | /usr/bin/python3 pyinst.py --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) /usr/bin/python3 pyinst.py --target-architecture universal2 - - name: Get SHA2-SUMS - id: get_sha - run: | - echo "::set-output name=sha256_macos::$(sha256sum dist/yt-dlp_macos | awk '{print $1}')" - echo "::set-output name=sha512_macos::$(sha512sum dist/yt-dlp_macos | awk '{print $1}')" - echo "::set-output name=sha256_macos_zip::$(sha256sum dist/yt-dlp_macos.zip | awk '{print $1}')" - echo "::set-output name=sha512_macos_zip::$(sha512sum dist/yt-dlp_macos.zip | awk '{print $1}')" - - name: Upload standalone binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload artifacts + uses: actions/upload-artifact@v3 with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_macos - asset_name: yt-dlp_macos - asset_content_type: application/octet-stream - - name: Upload onedir binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_macos.zip - asset_name: yt-dlp_macos.zip - asset_content_type: application/zip + path: | + dist/yt-dlp_macos + dist/yt-dlp_macos.zip build_macos_legacy: runs-on: macos-latest - needs: create_release - outputs: - sha256_macos_legacy: ${{ steps.get_sha.outputs.sha256_macos_legacy }} - sha512_macos_legacy: ${{ steps.get_sha.outputs.sha512_macos_legacy }} + needs: prepare steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install Python # We need the official Python, because the GA ones only support newer macOS versions env: @@ -269,42 +167,27 @@ jobs: - name: Prepare run: | - python3 devscripts/update-version.py ${{ needs.create_release.outputs.version_suffix }} + python3 devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} python3 devscripts/make_lazy_extractors.py - name: Build run: | python3 pyinst.py - - name: Get SHA2-SUMS - id: get_sha - run: | - echo "::set-output name=sha256_macos_legacy::$(sha256sum dist/yt-dlp_macos | awk '{print $1}')" - echo "::set-output name=sha512_macos_legacy::$(sha512sum dist/yt-dlp_macos | awk '{print $1}')" + mv dist/yt-dlp_macos dist/yt-dlp_macos_legacy - - name: Upload standalone binary - uses: 
actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload artifacts + uses: actions/upload-artifact@v3 with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_macos - asset_name: yt-dlp_macos_legacy - asset_content_type: application/octet-stream + path: | + dist/yt-dlp_macos_legacy build_windows: runs-on: windows-latest - needs: create_release - outputs: - sha256_win: ${{ steps.get_sha.outputs.sha256_win }} - sha512_win: ${{ steps.get_sha.outputs.sha512_win }} - sha256_py2exe: ${{ steps.get_sha.outputs.sha256_py2exe }} - sha512_py2exe: ${{ steps.get_sha.outputs.sha512_py2exe }} - sha256_win_zip: ${{ steps.get_sha.outputs.sha256_win_zip }} - sha512_win_zip: ${{ steps.get_sha.outputs.sha512_win_zip }} + needs: prepare steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: # 3.8 is used for Win7 support python-version: '3.8' - name: Install Requirements @@ -314,7 +197,7 @@ jobs: - name: Prepare run: | - python devscripts/update-version.py ${{ needs.create_release.outputs.version_suffix }} + python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} python devscripts/make_lazy_extractors.py - name: Build run: | @@ -323,55 +206,23 @@ jobs: python pyinst.py python pyinst.py --onedir Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip - - name: Get SHA2-SUMS - id: get_sha - run: | - echo "::set-output name=sha256_py2exe::$((Get-FileHash dist\yt-dlp_min.exe -Algorithm SHA256).Hash.ToLower())" - echo "::set-output name=sha512_py2exe::$((Get-FileHash dist\yt-dlp_min.exe -Algorithm SHA512).Hash.ToLower())" - echo "::set-output name=sha256_win::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA256).Hash.ToLower())" - echo "::set-output name=sha512_win::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA512).Hash.ToLower())" - echo "::set-output name=sha256_win_zip::$((Get-FileHash dist\yt-dlp_win.zip -Algorithm SHA256).Hash.ToLower())" - echo "::set-output name=sha512_win_zip::$((Get-FileHash dist\yt-dlp_win.zip -Algorithm SHA512).Hash.ToLower())" - - name: Upload py2exe binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload artifacts + uses: actions/upload-artifact@v3 with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_min.exe - asset_name: yt-dlp_min.exe - asset_content_type: application/vnd.microsoft.portable-executable - - name: Upload standalone binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp.exe - asset_name: yt-dlp.exe - asset_content_type: application/vnd.microsoft.portable-executable - - name: Upload onedir binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_win.zip - asset_name: yt-dlp_win.zip - asset_content_type: application/zip + path: | + dist/yt-dlp.exe + dist/yt-dlp_min.exe + dist/yt-dlp_win.zip build_windows32: runs-on: windows-latest - needs: create_release - outputs: - sha256_win32: ${{ steps.get_sha.outputs.sha256_win32 }} - sha512_win32: ${{ steps.get_sha.outputs.sha512_win32 }} + needs: prepare steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 
+ - uses: actions/setup-python@v4 with: # 3.7 is used for Vista support. See https://github.com/yt-dlp/yt-dlp/issues/390 python-version: '3.7' architecture: 'x86' @@ -382,95 +233,91 @@ jobs: - name: Prepare run: | - python devscripts/update-version.py ${{ needs.create_release.outputs.version_suffix }} + python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} python devscripts/make_lazy_extractors.py - name: Build run: | python pyinst.py - - name: Get SHA2-SUMS - id: get_sha - run: | - echo "::set-output name=sha256_win32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA256).Hash.ToLower())" - echo "::set-output name=sha512_win32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA512).Hash.ToLower())" - - name: Upload standalone binary - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload artifacts + uses: actions/upload-artifact@v3 with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./dist/yt-dlp_x86.exe - asset_name: yt-dlp_x86.exe - asset_content_type: application/vnd.microsoft.portable-executable + path: | + dist/yt-dlp_x86.exe - finish: + publish_release: runs-on: ubuntu-latest - needs: [create_release, build_unix, build_windows, build_windows32, build_macos, build_macos_legacy] + needs: [prepare, build_unix, build_windows, build_windows32, build_macos, build_macos_legacy] steps: - - name: Make SHA2-SUMS files + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + + - name: Get Changelog run: | - echo "${{ needs.build_unix.outputs.sha256_bin }} yt-dlp" >> SHA2-256SUMS - echo "${{ needs.build_unix.outputs.sha256_tar }} yt-dlp.tar.gz" >> SHA2-256SUMS - echo "${{ needs.build_unix.outputs.sha256_linux }} yt-dlp_linux" >> SHA2-256SUMS - echo "${{ needs.build_unix.outputs.sha256_linux_zip }} yt-dlp_linux.zip" >> SHA2-256SUMS - echo "${{ needs.build_windows.outputs.sha256_win }} yt-dlp.exe" >> SHA2-256SUMS - echo "${{ needs.build_windows.outputs.sha256_py2exe }} yt-dlp_min.exe" >> SHA2-256SUMS - echo "${{ needs.build_windows32.outputs.sha256_win32 }} yt-dlp_x86.exe" >> SHA2-256SUMS - echo "${{ needs.build_windows.outputs.sha256_win_zip }} yt-dlp_win.zip" >> SHA2-256SUMS - echo "${{ needs.build_macos.outputs.sha256_macos }} yt-dlp_macos" >> SHA2-256SUMS - echo "${{ needs.build_macos.outputs.sha256_macos_zip }} yt-dlp_macos.zip" >> SHA2-256SUMS - echo "${{ needs.build_macos_legacy.outputs.sha256_macos_legacy }} yt-dlp_macos_legacy" >> SHA2-256SUMS - echo "${{ needs.build_unix.outputs.sha512_bin }} yt-dlp" >> SHA2-512SUMS - echo "${{ needs.build_unix.outputs.sha512_tar }} yt-dlp.tar.gz" >> SHA2-512SUMS - echo "${{ needs.build_unix.outputs.sha512_linux }} yt-dlp_linux" >> SHA2-512SUMS - echo "${{ needs.build_unix.outputs.sha512_linux_zip }} yt-dlp_linux.zip" >> SHA2-512SUMS - echo "${{ needs.build_windows.outputs.sha512_win }} yt-dlp.exe" >> SHA2-512SUMS - echo "${{ needs.build_windows.outputs.sha512_py2exe }} yt-dlp_min.exe" >> SHA2-512SUMS - echo "${{ needs.build_windows32.outputs.sha512_win32 }} yt-dlp_x86.exe" >> SHA2-512SUMS - echo "${{ needs.build_windows.outputs.sha512_win_zip }} yt-dlp_win.zip" >> SHA2-512SUMS - echo "${{ needs.build_macos.outputs.sha512_macos }} yt-dlp_macos" >> SHA2-512SUMS - echo "${{ needs.build_macos.outputs.sha512_macos_zip }} yt-dlp_macos.zip" >> SHA2-512SUMS - echo "${{ needs.build_macos_legacy.outputs.sha512_macos_legacy }} yt-dlp_macos_legacy" >> SHA2-512SUMS - - - name: Upload SHA2-256SUMS file - uses: actions/upload-release-asset@v1 - 
env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./SHA2-256SUMS - asset_name: SHA2-256SUMS - asset_content_type: text/plain - - name: Upload SHA2-512SUMS file - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./SHA2-512SUMS - asset_name: SHA2-512SUMS - asset_content_type: text/plain - + changelog=$(grep -oPz '(?s)(?<=### ${{ steps.bump_version.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)' Changelog.md) || true + echo "changelog<> $GITHUB_ENV + echo "$changelog" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV - name: Make Update spec run: | echo "# This file is used for regulating self-update" >> _update_spec echo "lock 2022.07.18 .+ Python 3.6" >> _update_spec - - name: Upload update spec - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create_release.outputs.upload_url }} - asset_path: ./_update_spec - asset_name: _update_spec - asset_content_type: text/plain - - - name: Finalize release - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Make SHA2-SUMS files run: | - gh api -X PATCH -H "Accept: application/vnd.github.v3+json" \ - /repos/${{ github.repository }}/releases/${{ needs.create_release.outputs.release_id }} \ - -F draft=false + sha256sum artifact/yt-dlp | awk '{print $1 " yt-dlp"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp.tar.gz | awk '{print $1 " yt-dlp.tar.gz"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp.exe | awk '{print $1 " yt-dlp.exe"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_win.zip | awk '{print $1 " yt-dlp_win.zip"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_min.exe | awk '{print $1 " yt-dlp_min.exe"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_x86.exe | awk '{print $1 " yt-dlp_x86.exe"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_macos | awk '{print $1 " yt-dlp_macos"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_macos.zip | awk '{print $1 " yt-dlp_macos.zip"}' >> SHA2-256SUMS + sha256sum artifact/yt-dlp_macos_legacy | awk '{print $1 " yt-dlp_macos_legacy"}' >> SHA2-256SUMS + sha256sum artifact/dist/yt-dlp_linux | awk '{print $1 " yt-dlp_linux"}' >> SHA2-256SUMS + sha256sum artifact/dist/yt-dlp_linux.zip | awk '{print $1 " yt-dlp_linux.zip"}' >> SHA2-256SUMS + sha512sum artifact/yt-dlp | awk '{print $1 " yt-dlp"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp.tar.gz | awk '{print $1 " yt-dlp.tar.gz"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp.exe | awk '{print $1 " yt-dlp.exe"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_win.zip | awk '{print $1 " yt-dlp_win.zip"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_min.exe | awk '{print $1 " yt-dlp_min.exe"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_x86.exe | awk '{print $1 " yt-dlp_x86.exe"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_macos | awk '{print $1 " yt-dlp_macos"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_macos.zip | awk '{print $1 " yt-dlp_macos.zip"}' >> SHA2-512SUMS + sha512sum artifact/yt-dlp_macos_legacy | awk '{print $1 " yt-dlp_macos_legacy"}' >> SHA2-512SUMS + sha512sum artifact/dist/yt-dlp_linux | awk '{print $1 " yt-dlp_linux"}' >> SHA2-512SUMS + sha512sum artifact/dist/yt-dlp_linux.zip | awk '{print $1 " yt-dlp_linux.zip"}' >> SHA2-512SUMS + + - name: Publish Release + uses: yt-dlp/action-gh-release@v1 + with: + tag_name: ${{ needs.prepare.outputs.ytdlp_version }} + name: yt-dlp ${{ 
needs.prepare.outputs.ytdlp_version }}
+          target_commitish: ${{ needs.prepare.outputs.head_sha }}
+          body: |
+            #### [A description of the various files]((https://github.com/yt-dlp/yt-dlp#release-files)) are in the README
+
+            ---
+            <details open><summary>Changelog</summary>
+            <p>
+
+            ${{ env.changelog }}
+
+            </p>
+            </details>
+ files: | + SHA2-256SUMS + SHA2-512SUMS + artifact/yt-dlp + artifact/yt-dlp.tar.gz + artifact/yt-dlp.exe + artifact/yt-dlp_win.zip + artifact/yt-dlp_min.exe + artifact/yt-dlp_x86.exe + artifact/yt-dlp_macos + artifact/yt-dlp_macos.zip + artifact/yt-dlp_macos_legacy + artifact/dist/yt-dlp_linux + artifact/dist/yt-dlp_linux.zip + _update_spec diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index a60e002d9e..d0e890b30e 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -21,9 +21,9 @@ jobs: python-version: pypy-3.9 run-tests-ext: bat steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install pytest diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index e8eb1fd12e..cc2da62fae 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -6,9 +6,9 @@ jobs: if: "contains(github.event.head_commit.message, 'ci run dl')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install test requirements @@ -36,9 +36,9 @@ jobs: python-version: pypy-3.9 run-tests-ext: bat steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install pytest diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index d8e14f4705..53b74e2c75 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -6,9 +6,9 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install test requirements @@ -20,9 +20,9 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install flake8 From 115add43876964956917bf596c1d0b148c5b3c26 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 9 Aug 2022 01:08:47 +0530 Subject: [PATCH 009/284] [devscripts] Create `utils` and refactor --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 7 ++++ .../ISSUE_TEMPLATE/2_site_support_request.yml | 7 ++++ .../ISSUE_TEMPLATE/3_site_feature_request.yml | 7 ++++ .github/ISSUE_TEMPLATE/4_bug_report.yml | 7 ++++ .github/ISSUE_TEMPLATE/5_feature_request.yml | 7 ++++ .github/ISSUE_TEMPLATE/6_question.yml | 9 +++- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml | 1 + .../2_site_support_request.yml | 1 + .../3_site_feature_request.yml | 1 + .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml | 1 + .../ISSUE_TEMPLATE_tmpl/5_feature_request.yml | 1 + .github/ISSUE_TEMPLATE_tmpl/6_question.yml | 3 +- .github/PULL_REQUEST_TEMPLATE.md | 2 + README.md | 2 +- devscripts/make_issue_template.py | 40 +++++++++--------- devscripts/make_lazy_extractors.py | 16 +++----- devscripts/make_readme.py | 23 +++++++---- devscripts/make_supportedsites.py | 12 +----- 
devscripts/prepare_manpage.py | 41 ++++++++++--------- devscripts/update-formulae.py | 14 ++++--- devscripts/update-version.py | 41 ++++++++++--------- devscripts/utils.py | 35 ++++++++++++++++ pyinst.py | 18 ++++---- setup.py | 20 +++------ 24 files changed, 191 insertions(+), 125 deletions(-) create mode 100644 devscripts/utils.py diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 7117039ed7..611e232b56 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -2,6 +2,13 @@ name: Broken site description: Report broken or misfunctioning site labels: [triage, site-bug] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ffe8f32f0d..ace41816b6 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -2,6 +2,13 @@ name: Site support request description: Request support for a new site labels: [triage, site-request] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 11bd109a6f..24fbfee931 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -2,6 +2,13 @@ name: Site feature request description: Request a new functionality for a supported site labels: [triage, site-enhancement] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 412bb9757c..f10339cd81 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -2,6 +2,13 @@ name: Bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index c41ea85335..464a3e23a5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -2,6 +2,13 @@ name: Feature request description: Request a new functionality unrelated to any particular site or extractor labels: [triage, 
enhancement] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index edfa4c7a0d..0498e9af1b 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -2,12 +2,19 @@ name: Ask question description: Ask yt-dlp related question labels: [question] body: + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + required: true - type: markdown attributes: value: | ### Make sure you are **only** asking a question and not reporting a bug or requesting a feature. If your question contains "isn't working" or "can you add", this is most likely the wrong template. - If you are in doubt whether this is the right template, **use another template**! + If you are in doubt whether this is the right template, **USE ANOTHER TEMPLATE**! - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index 35fae2be61..16efba5793 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -2,6 +2,7 @@ name: Broken site description: Report broken or misfunctioning site labels: [triage, site-bug] body: + %(no_skip)s - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 02125f77df..522eb751eb 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -2,6 +2,7 @@ name: Site support request description: Request support for a new site labels: [triage, site-request] body: + %(no_skip)s - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 154d4e35f6..2b46650f70 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -2,6 +2,7 @@ name: Site feature request description: Request a new functionality for a supported site labels: [triage, site-enhancement] body: + %(no_skip)s - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 650ef208e8..fd966e8ca3 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -2,6 +2,7 @@ name: Bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: + %(no_skip)s - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index 6c0ecf386d..8bbc5d733f 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -2,6 +2,7 @@ name: Feature request description: Request a new 
functionality unrelated to any particular site or extractor labels: [triage, enhancement] body: + %(no_skip)s - type: checkboxes id: checklist attributes: diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index 1df4d41db9..ee09e82a38 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -2,12 +2,13 @@ name: Ask question description: Ask yt-dlp related question labels: [question] body: + %(no_skip)s - type: markdown attributes: value: | ### Make sure you are **only** asking a question and not reporting a bug or requesting a feature. If your question contains "isn't working" or "can you add", this is most likely the wrong template. - If you are in doubt whether this is the right template, **use another template**! + If you are in doubt whether this is the right template, **USE ANOTHER TEMPLATE**! - type: checkboxes id: checklist attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ec95903d65..5abc6ce41e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,5 @@ +**IMPORTANT**: PRs without the template will be CLOSED + ### Description of your *pull request* and other information diff --git a/README.md b/README.md index 09ca5d876f..0a6dd53d73 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,7 @@ #### Deprecated ## COMPILE ### Standalone PyInstaller Builds -To build the Windows/MacOS executable, you must have Python and `pyinstaller` (plus any of yt-dlp's [optional dependencies](#dependencies) if needed). Once you have all the necessary dependencies installed, simply run `pyinst.py`. The executable will be built for the same architecture (32/64 bit) as the Python used. +To build the standalone executable, you must have Python and `pyinstaller` (plus any of yt-dlp's [optional dependencies](#dependencies) if needed). Once you have all the necessary dependencies installed, simply run `pyinst.py`. The executable will be built for the same architecture (x86/ARM, 32/64 bit) as the Python used. 
python3 -m pip install -U pyinstaller -r requirements.txt python3 devscripts/make_lazy_extractors.py diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 90e7e0b43e..fd964c6c65 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -7,20 +7,14 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import optparse import re - -def read(fname): - with open(fname, encoding='utf-8') as f: - return f.read() - - -# Get the version without importing the package -def read_version(fname): - exec(compile(read(fname), fname, 'exec')) - return locals()['__version__'] - +from devscripts.utils import ( + get_filename_args, + read_file, + read_version, + write_file, +) VERBOSE_TMPL = ''' - type: checkboxes @@ -58,20 +52,24 @@ def read_version(fname): required: true '''.strip() +NO_SKIP = ''' + - type: checkboxes + attributes: + label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE + description: Fill all fields even if you think it is irrelevant for the issue + options: + - label: I understand that I will be **blocked** if I remove or skip any mandatory\\* field + required: true +'''.strip() + def main(): - parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') - _, args = parser.parse_args() - if len(args) != 2: - parser.error('Expected an input and an output filename') - - fields = {'version': read_version('yt_dlp/version.py')} + fields = {'version': read_version(), 'no_skip': NO_SKIP} fields['verbose'] = VERBOSE_TMPL % fields fields['verbose_optional'] = re.sub(r'(\n\s+validations:)?\n\s+required: true', '', fields['verbose']) - infile, outfile = args - with open(outfile, 'w', encoding='utf-8') as outf: - outf.write(read(infile) % fields) + infile, outfile = get_filename_args(has_infile=True) + write_file(outfile, read_file(infile) % fields) if __name__ == '__main__': diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index c9fdfb5623..01bd88ae61 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -7,9 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import optparse from inspect import getsource +from devscripts.utils import get_filename_args, read_file, write_file + NO_ATTR = object() STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit'] CLASS_METHODS = [ @@ -19,17 +20,11 @@ class {name}({bases}): _module = {module!r} ''' -with open('devscripts/lazy_load_template.py', encoding='utf-8') as f: - MODULE_TEMPLATE = f.read() +MODULE_TEMPLATE = read_file('devscripts/lazy_load_template.py') def main(): - parser = optparse.OptionParser(usage='%prog [OUTFILE.py]') - args = parser.parse_args()[1] or ['yt_dlp/extractor/lazy_extractors.py'] - if len(args) != 1: - parser.error('Expected only an output filename') - - lazy_extractors_filename = args[0] + lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) @@ -46,8 +41,7 @@ def main(): *build_ies(_ALL_CLASSES, (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), )) - with open(lazy_extractors_filename, 'wt', encoding='utf-8') as f: - f.write(f'{module_src}\n') + write_file(lazy_extractors_filename, f'{module_src}\n') def get_all_ies(): diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index f2e08d7c6e..767ea5409f 100755 --- 
a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -5,10 +5,17 @@ This must be run in a console of correct width """ +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import functools import re -import sys + +from devscripts.utils import read_file, write_file README_FILE = 'README.md' @@ -63,12 +70,10 @@ def apply_patch(text, patch): ), ) -with open(README_FILE, encoding='utf-8') as f: - readme = f.read() +readme = read_file(README_FILE) -with open(README_FILE, 'w', encoding='utf-8') as f: - f.write(''.join(( - take_section(readme, end=f'## {OPTIONS_START}'), - functools.reduce(apply_patch, PATCHES, options), - take_section(readme, f'# {OPTIONS_END}'), - ))) +write_file(README_FILE, ''.join(( + take_section(readme, end=f'## {OPTIONS_START}'), + functools.reduce(apply_patch, PATCHES, options), + take_section(readme, f'# {OPTIONS_END}'), +))) diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index e46f7af565..01548ef97a 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -7,21 +7,13 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import optparse - +from devscripts.utils import get_filename_args, write_file from yt_dlp.extractor import list_extractor_classes def main(): - parser = optparse.OptionParser(usage='%prog OUTFILE.md') - _, args = parser.parse_args() - if len(args) != 1: - parser.error('Expected an output filename') - out = '\n'.join(ie.description() for ie in list_extractor_classes() if ie.IE_DESC is not False) - - with open(args[0], 'w', encoding='utf-8') as outf: - outf.write(f'# Supported sites\n{out}\n') + write_file(get_filename_args(), f'# Supported sites\n{out}\n') if __name__ == '__main__': diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index cea9349499..9b12e71e5f 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -1,9 +1,22 @@ #!/usr/bin/env python3 -import optparse +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + import os.path import re +from devscripts.utils import ( + compose_functions, + get_filename_args, + read_file, + write_file, +) + ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') @@ -22,25 +35,6 @@ ''' -def main(): - parser = optparse.OptionParser(usage='%prog OUTFILE.md') - _, args = parser.parse_args() - if len(args) != 1: - parser.error('Expected an output filename') - - outfile, = args - - with open(README_FILE, encoding='utf-8') as f: - readme = f.read() - - readme = filter_excluded_sections(readme) - readme = move_sections(readme) - readme = filter_options(readme) - - with open(outfile, 'w', encoding='utf-8') as outf: - outf.write(PREFIX + readme) - - def filter_excluded_sections(readme): EXCLUDED_SECTION_BEGIN_STRING = re.escape('') EXCLUDED_SECTION_END_STRING = re.escape('') @@ -92,5 +86,12 @@ def filter_options(readme): return readme.replace(section, options, 1) +TRANSFORM = compose_functions(filter_excluded_sections, move_sections, filter_options) + + +def main(): + write_file(get_filename_args(), PREFIX + TRANSFORM(read_file(README_FILE))) + + if __name__ == '__main__': main() diff --git a/devscripts/update-formulae.py b/devscripts/update-formulae.py index 96b56b9324..e79297f530 100644 --- a/devscripts/update-formulae.py +++ 
b/devscripts/update-formulae.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +""" +Usage: python3 ./devscripts/update-formulae.py +version can be either 0-aligned (yt-dlp version) or normalized (PyPi version) +""" + # Allow direct execution import os import sys @@ -11,8 +16,7 @@ import re import urllib.request -# usage: python3 ./devscripts/update-formulae.py -# version can be either 0-aligned (yt-dlp version) or normalized (PyPl version) +from devscripts.utils import read_file, write_file filename, version = sys.argv[1:] @@ -27,11 +31,9 @@ sha256sum = tarball_file['digests']['sha256'] url = tarball_file['url'] -with open(filename) as r: - formulae_text = r.read() +formulae_text = read_file(filename) formulae_text = re.sub(r'sha256 "[0-9a-f]*?"', 'sha256 "%s"' % sha256sum, formulae_text, count=1) formulae_text = re.sub(r'url "[^"]*?"', 'url "%s"' % url, formulae_text, count=1) -with open(filename, 'w') as w: - w.write(formulae_text) +write_file(filename, formulae_text) diff --git a/devscripts/update-version.py b/devscripts/update-version.py index c5bc83de93..c55dd371c5 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -7,32 +7,35 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import contextlib import subprocess import sys from datetime import datetime -with open('yt_dlp/version.py') as f: - exec(compile(f.read(), 'yt_dlp/version.py', 'exec')) -old_version = locals()['__version__'] +from devscripts.utils import read_version, write_file -old_version_list = old_version.split('.') -old_ver = '.'.join(old_version_list[:3]) -old_rev = old_version_list[3] if len(old_version_list) > 3 else '' +def get_new_version(revision): + version = datetime.utcnow().strftime('%Y.%m.%d') -ver = datetime.utcnow().strftime("%Y.%m.%d") + if revision: + assert revision.isdigit(), 'Revision must be a number' + else: + old_version = read_version().split('.') + if version.split('.') == old_version[:3]: + revision = str(int((old_version + [0])[3]) + 1) -rev = (sys.argv[1:] or [''])[0] # Use first argument, if present as revision number -if not rev: - rev = str(int(old_rev or 0) + 1) if old_ver == ver else '' + return f'{version}.{revision}' if revision else version -VERSION = '.'.join((ver, rev)) if rev else ver -try: - sp = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE) - GIT_HEAD = sp.communicate()[0].decode().strip() or None -except Exception: - GIT_HEAD = None +def get_git_head(): + with contextlib.suppress(Exception): + sp = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE) + return sp.communicate()[0].decode().strip() or None + + +VERSION = get_new_version((sys.argv + [''])[1]) +GIT_HEAD = get_git_head() VERSION_FILE = f'''\ # Autogenerated by devscripts/update-version.py @@ -42,8 +45,6 @@ RELEASE_GIT_HEAD = {GIT_HEAD!r} ''' -with open('yt_dlp/version.py', 'wt') as f: - f.write(VERSION_FILE) - -print('::set-output name=ytdlp_version::' + VERSION) +write_file('yt_dlp/version.py', VERSION_FILE) +print(f'::set-output name=ytdlp_version::{VERSION}') print(f'\nVersion = {VERSION}, Git HEAD = {GIT_HEAD}') diff --git a/devscripts/utils.py b/devscripts/utils.py new file mode 100644 index 0000000000..aa17a5f7f6 --- /dev/null +++ b/devscripts/utils.py @@ -0,0 +1,35 @@ +import argparse +import functools + + +def read_file(fname): + with open(fname, encoding='utf-8') as f: + return f.read() + + +def write_file(fname, content): + with open(fname, 'w', encoding='utf-8') as f: + return 
f.write(content) + + +# Get the version without importing the package +def read_version(fname='yt_dlp/version.py'): + exec(compile(read_file(fname), fname, 'exec')) + return locals()['__version__'] + + +def get_filename_args(has_infile=False, default_outfile=None): + parser = argparse.ArgumentParser() + if has_infile: + parser.add_argument('infile', help='Input file') + kwargs = {'nargs': '?', 'default': default_outfile} if default_outfile else {} + parser.add_argument('outfile', **kwargs, help='Output file') + + opts = parser.parse_args() + if has_infile: + return opts.infile, opts.outfile + return opts.outfile + + +def compose_functions(*functions): + return lambda x: functools.reduce(lambda y, f: f(y), functions, x) diff --git a/pyinst.py b/pyinst.py index 31854e881c..9be5d89604 100644 --- a/pyinst.py +++ b/pyinst.py @@ -1,11 +1,17 @@ #!/usr/bin/env python3 +# Allow direct execution import os -import platform import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import platform + from PyInstaller.__main__ import run as run_pyinstaller +from devscripts.utils import read_version + OS_NAME, MACHINE, ARCH = sys.platform, platform.machine(), platform.architecture()[0][:2] if MACHINE in ('x86_64', 'AMD64') or ('i' in MACHINE and '86' in MACHINE): # NB: Windows x86 has MACHINE = AMD64 irrespective of bitness @@ -13,8 +19,7 @@ def main(): - opts = parse_options() - version = read_version('yt_dlp/version.py') + opts, version = parse_options(), read_version() onedir = '--onedir' in opts or '-D' in opts if not onedir and '-F' not in opts and '--onefile' not in opts: @@ -53,13 +58,6 @@ def parse_options(): return opts -# Get the version from yt_dlp/version.py without importing the package -def read_version(fname): - with open(fname, encoding='utf-8') as f: - exec(compile(f.read(), fname, 'exec')) - return locals()['__version__'] - - def exe(onedir): """@returns (name, path)""" name = '_'.join(filter(None, ( diff --git a/setup.py b/setup.py index dab09c268c..aebe1dead9 100644 --- a/setup.py +++ b/setup.py @@ -12,28 +12,18 @@ from distutils.core import Command, setup setuptools_available = False +from devscripts.utils import read_file, read_version -def read(fname): - with open(fname, encoding='utf-8') as f: - return f.read() - - -# Get the version from yt_dlp/version.py without importing the package -def read_version(fname): - exec(compile(read(fname), fname, 'exec')) - return locals()['__version__'] - - -VERSION = read_version('yt_dlp/version.py') +VERSION = read_version() DESCRIPTION = 'A youtube-dl fork with additional features and patches' LONG_DESCRIPTION = '\n\n'.join(( 'Official repository: ', '**PS**: Some links in this document will not work since this is a copy of the README.md from Github', - read('README.md'))) + read_file('README.md'))) -REQUIREMENTS = read('requirements.txt').splitlines() +REQUIREMENTS = read_file('requirements.txt').splitlines() def packages(): @@ -121,7 +111,7 @@ def run(self): if self.dry_run: print('Skipping build of lazy extractors in dry run mode') return - subprocess.run([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py']) + subprocess.run([sys.executable, 'devscripts/make_lazy_extractors.py']) params = py2exe_params() if sys.argv[1:2] == ['py2exe'] else build_params() From 70b2340909d8d917f71d20181614fd7392d3f7f0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 29 Jul 2022 20:33:01 +0530 Subject: [PATCH 010/284] [build, devscripts] Add devscript to set a build variant Closes #4471 --- 
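Notes:
    A minimal sketch of the mechanism, illustrative only (the actual helper is
    `property_setter` in devscripts/set-variant.py, added below): the devscript
    regex-rewrites the module-level constants that update-version.py now emits
    into yt_dlp/version.py.

        import functools
        import re

        def property_setter(name, value):
            # Return a callable that rewrites a module-level `NAME = ...`
            # assignment to the repr of the new value
            return functools.partial(re.sub, rf'(?m)^{name}\s*=\s*.+$', f'{name} = {value!r}')

        text = 'VARIANT = None\nUPDATE_HINT = None\n'
        print(property_setter('VARIANT', 'pip')(text))
        # VARIANT = 'pip'
        # UPDATE_HINT = None

    The build workflow (below) invokes it as, e.g.:
        python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update"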
.github/workflows/build.yml | 1 + README.md | 7 ++++--- devscripts/make_readme.py | 4 ++++ devscripts/set-variant.py | 36 ++++++++++++++++++++++++++++++++++++ devscripts/update-version.py | 4 ++++ yt_dlp/YoutubeDL.py | 4 +++- yt_dlp/options.py | 9 ++++++--- yt_dlp/update.py | 13 ++++++++----- yt_dlp/version.py | 4 ++++ 9 files changed, 70 insertions(+), 12 deletions(-) create mode 100644 devscripts/set-variant.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f3cc9930d5..bd343d95d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,6 +89,7 @@ jobs: if: "env.TWINE_PASSWORD != ''" run: | rm -rf dist/* + python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update" python setup.py sdist bdist_wheel twine upload dist/* diff --git a/README.md b/README.md index 0a6dd53d73..e38c6981a9 100644 --- a/README.md +++ b/README.md @@ -343,7 +343,8 @@ ### Standalone Py2Exe Builds (Windows) ### Related scripts -* **`devscripts/update-version.py`** - Update the version number based on current timestamp +* **`devscripts/update-version.py [revision]`** - Update the version number based on current date +* **`devscripts/set-variant.py variant [-M update_message]`** - Set the build variant of the executable * **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading. You can also fork the project on github and run your fork's [build workflow](.github/workflows/build.yml) to automatically build a full release @@ -360,8 +361,8 @@ # USAGE AND OPTIONS ## General Options: -h, --help Print this help text and exit --version Print program version and exit - -U, --update Update this program to latest version - --no-update Do not update (default) + -U, --update Update this program to the latest version + --no-update Do not check for updates (default) -i, --ignore-errors Ignore download and postprocessing errors. 
The download will be considered successful even if the postprocessing fails diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index 767ea5409f..fad993a199 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -45,6 +45,10 @@ def apply_patch(text, patch): delim = f'\n{" " * switch_col_width}' PATCHES = ( + ( # Standardize update message + r'(?m)^( -U, --update\s+).+(\n \s.+)*$', + r'\1Update this program to the latest version', + ), ( # Headings r'(?m)^ (\w.+\n)( (?=\w))?', r'## \1' diff --git a/devscripts/set-variant.py b/devscripts/set-variant.py new file mode 100644 index 0000000000..10341e7444 --- /dev/null +++ b/devscripts/set-variant.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +import argparse +import functools +import re + +from devscripts.utils import compose_functions, read_file, write_file + +VERSION_FILE = 'yt_dlp/version.py' + + +def parse_options(): + parser = argparse.ArgumentParser(description='Set the build variant of the package') + parser.add_argument('variant', help='Name of the variant') + parser.add_argument('-M', '--update-message', default=None, help='Message to show in -U') + return parser.parse_args() + + +def property_setter(name, value): + return functools.partial(re.sub, rf'(?m)^{name}\s*=\s*.+$', f'{name} = {value!r}') + + +opts = parse_options() +transform = compose_functions( + property_setter('VARIANT', opts.variant), + property_setter('UPDATE_HINT', opts.update_message) +) + +write_file(VERSION_FILE, transform(read_file(VERSION_FILE))) diff --git a/devscripts/update-version.py b/devscripts/update-version.py index c55dd371c5..caebf42414 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -43,6 +43,10 @@ def get_git_head(): __version__ = {VERSION!r} RELEASE_GIT_HEAD = {GIT_HEAD!r} + +VARIANT = None + +UPDATE_HINT = None ''' write_file('yt_dlp/version.py', VERSION_FILE) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index ded34b8edc..228aa7bf5e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -144,7 +144,7 @@ write_json_file, write_string, ) -from .version import RELEASE_GIT_HEAD, __version__ +from .version import RELEASE_GIT_HEAD, VARIANT, __version__ if compat_os_name == 'nt': import ctypes @@ -3676,6 +3676,8 @@ def get_encoding(stream): write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') source = detect_variant() + if VARIANT not in (None, 'pip'): + source += '*' write_debug(join_nonempty( 'yt-dlp version', __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', diff --git a/yt_dlp/options.py b/yt_dlp/options.py index b70f5798e3..2c7f686dde 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -20,12 +20,13 @@ SponsorBlockPP, ) from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE -from .update import detect_variant +from .update import detect_variant, is_non_updateable from .utils import ( OUTTMPL_TYPES, POSTPROCESS_WHEN, Config, expand_path, + format_field, get_executable_path, join_nonempty, remove_end, @@ -333,11 +334,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '-U', '--update', action='store_true', dest='update_self', - help='Update this program to latest version') + help=format_field( + is_non_updateable(), None, 'Check if updates are available. 
%s', + default='Update this program to the latest version')) general.add_option( '--no-update', action='store_false', dest='update_self', - help='Do not update (default)') + help='Do not check for updates (default)') general.add_option( '-i', '--ignore-errors', action='store_true', dest='ignoreerrors', diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 92c07acc14..a04518c9b6 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -18,7 +18,7 @@ traverse_obj, version_tuple, ) -from .version import __version__ +from .version import UPDATE_HINT, VARIANT, __version__ REPOSITORY = 'yt-dlp/yt-dlp' API_URL = f'https://api.github.com/repos/{REPOSITORY}/releases' @@ -47,7 +47,7 @@ def _get_variant_and_executable_path(): def detect_variant(): - return _get_variant_and_executable_path()[0] + return VARIANT or _get_variant_and_executable_path()[0] _FILE_SUFFIXES = { @@ -64,13 +64,16 @@ def detect_variant(): **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, 'source': 'You cannot update when running from source code; Use git to pull the latest changes', - 'unknown': 'It looks like you installed yt-dlp with a package manager, pip or setup.py; Use that to update', - 'other': 'It looks like you are using an unofficial build of yt-dlp; Build the executable again', + 'unknown': 'You installed yt-dlp with a package manager or setup.py; Use that to update', + 'other': 'You are using an unofficial build of yt-dlp; Build the executable again', } def is_non_updateable(): - return _NON_UPDATEABLE_REASONS.get(detect_variant(), _NON_UPDATEABLE_REASONS['other']) + if UPDATE_HINT: + return UPDATE_HINT + return _NON_UPDATEABLE_REASONS.get( + detect_variant(), _NON_UPDATEABLE_REASONS['unknown' if VARIANT else 'other']) def _sha256_file(path): diff --git a/yt_dlp/version.py b/yt_dlp/version.py index a1a5880e95..75ede4973c 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -3,3 +3,7 @@ __version__ = '2022.07.18' RELEASE_GIT_HEAD = '135f05ef6' + +VARIANT = None + +UPDATE_HINT = None From f0ad6f8c510449bf79c818bafd27779f24e2fbbc Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 01:49:28 +0530 Subject: [PATCH 011/284] Remove filtered entries from `-J` Closes #4369 --- yt_dlp/YoutubeDL.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 228aa7bf5e..2b7af4cd7e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1797,6 +1797,8 @@ def __process_playlist(self, ie_result, download): }) if self._match_entry(entry_copy, incomplete=True) is not None: + # For compatibility with youtube-dl. 
See https://github.com/yt-dlp/yt-dlp/issues/4369 + resolved_entries[i] = (playlist_index, NO_DEFAULT) continue self.to_screen('[download] Downloading video %s of %s' % ( @@ -1817,7 +1819,8 @@ def __process_playlist(self, ie_result, download): resolved_entries[i] = (playlist_index, entry_result) # Update with processed data - ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT] + ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT] # Write the updated info to json if _infojson_written is True and self._write_info_json( From e251986cbe7c62a7bef02a1a32bae21dff25565e Mon Sep 17 00:00:00 2001 From: Eren Kemer Date: Mon, 8 Aug 2022 23:09:37 +0200 Subject: [PATCH 012/284] [extractor/harpodeon] Add extractor (#4540) Closes #4450 Authored by: eren-kemer --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/harpodeon.py | 70 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 yt_dlp/extractor/harpodeon.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3abae19b01..0bb685fa29 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -631,6 +631,7 @@ GronkhVodsIE ) from .groupon import GrouponIE +from .harpodeon import HarpodeonIE from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE diff --git a/yt_dlp/extractor/harpodeon.py b/yt_dlp/extractor/harpodeon.py new file mode 100644 index 0000000000..0aa47337ff --- /dev/null +++ b/yt_dlp/extractor/harpodeon.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import unified_strdate + + +class HarpodeonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '727371564a6a9ebccef2073535b5b6bd', + 'skip': 'Free video could become unavailable', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '6dfea5412845f690c7331be703f884db', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710', + 'md5': '7979df9ca04637282cb7d172ab3a9c3b', + 'info_dict': { + 'id': '421838710', + 'ext': 'mp4', + 'title': 'Behind the Screen', + 'description': 'md5:008972a3dc51fba3965ee517d2ba9155', + 'creator': 'Lone Star Corporation', + 'release_date': '19160101' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title, creator, release_year = self._search_regex( + r'''(?x) + <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2> + (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''', + webpage, 'title', group=('title', 'creator', 'release_year'), + fatal=False) or (None, None, None) + + hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base') + + hp_inject_video, hp_resolution = self._search_regex( 
r'''(?x) + hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"], + [\'\"](?P<hp_resolution>\d+)[\'\"]''', + webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution']) + + return { + 'id': video_id, + 'title': title, + 'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4', + 'http_headers': {'Referer': url}, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'creator': creator, + 'release_date': unified_strdate(f'{release_year}0101') + } From 2a5e5477bcb70d62de20556924a405857d071e09 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 8 Aug 2022 16:11:47 -0500 Subject: [PATCH 013/284] [extractor/redbee] Unify and update extractors (#4479) Closes #4443 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/parliamentliveuk.py | 77 ------ yt_dlp/extractor/redbee.py | 361 +++++++++++++++++++++++++++ yt_dlp/extractor/rtbf.py | 156 ------------ 4 files changed, 362 insertions(+), 235 deletions(-) delete mode 100644 yt_dlp/extractor/parliamentliveuk.py create mode 100644 yt_dlp/extractor/redbee.py delete mode 100644 yt_dlp/extractor/rtbf.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0bb685fa29..73795ddc5f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1236,7 +1236,6 @@ ParamountPlusIE, ParamountPlusSeriesIE, ) -from .parliamentliveuk import ParliamentLiveUKIE from .parlview import ParlviewIE from .patreon import ( PatreonIE, @@ -1407,6 +1406,7 @@ RCTIPlusTVIE, ) from .rds import RDSIE +from .redbee import ParliamentLiveUKIE, RTBFIE from .redbulltv import ( RedBullTVIE, RedBullEmbedIE, @@ -1440,7 +1440,6 @@ from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE from .rozhlas import RozhlasIE -from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import ( RtlNlIE, diff --git a/yt_dlp/extractor/parliamentliveuk.py b/yt_dlp/extractor/parliamentliveuk.py deleted file mode 100644 index 38cb031645..0000000000 --- a/yt_dlp/extractor/parliamentliveuk.py +++ /dev/null @@ -1,77 +0,0 @@ -import json -import uuid - -from .common import InfoExtractor -from ..utils import ( - unified_timestamp, - try_get, -) - - -class ParliamentLiveUKIE(InfoExtractor): - IE_NAME = 'parliamentlive.tv' - IE_DESC = 'UK parliament videos' - _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - - _TESTS = [{ - 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', - 'info_dict': { - 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', - 'ext': 'mp4', - 'title': 'Home Affairs Committee', - 'timestamp': 1395153872, - 'upload_date': '20140318', - }, - }, { - 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json(f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id) - _DEVICE_ID = str(uuid.uuid4()) - auth = 'Bearer ' + self._download_json( - 'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/auth/anonymous', - video_id, headers={ - 'Origin': 'https://videoplayback.parliamentlive.tv', - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json;charset=utf-8' - }, data=json.dumps({ - 'deviceId': _DEVICE_ID, - 'device': { - 'deviceId': 
_DEVICE_ID, - 'width': 653, - 'height': 368, - 'type': 'WEB', - 'name': ' Mozilla Firefox 91' - } - }).encode('utf-8'))['sessionToken'] - - video_urls = self._download_json( - f'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/entitlement/{video_id}/play', - video_id, headers={'Authorization': auth, 'Accept': 'application/json, text/plain, */*'})['formats'] - - formats = [] - for format in video_urls: - if not format.get('mediaLocator'): - continue - if format.get('format') == 'DASH': - formats.extend(self._extract_mpd_formats( - format['mediaLocator'], video_id, mpd_id='dash', fatal=False)) - elif format.get('format') == 'SMOOTHSTREAMING': - formats.extend(self._extract_ism_formats( - format['mediaLocator'], video_id, ism_id='ism', fatal=False)) - elif format.get('format') == 'HLS': - formats.extend(self._extract_m3u8_formats( - format['mediaLocator'], video_id, m3u8_id='hls', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': video_info['event']['title'], - 'timestamp': unified_timestamp(try_get(video_info, lambda x: x['event']['publishedStartTime'])), - 'thumbnail': video_info.get('thumbnailUrl'), - } diff --git a/yt_dlp/extractor/redbee.py b/yt_dlp/extractor/redbee.py new file mode 100644 index 0000000000..dc8b272fc0 --- /dev/null +++ b/yt_dlp/extractor/redbee.py @@ -0,0 +1,361 @@ +import json +import re +import time +import urllib.parse +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, +) + + +class RedBeeBaseIE(InfoExtractor): + _DEVICE_ID = str(uuid.uuid4()) + + @property + def _API_URL(self): + """ + Ref: https://apidocs.emp.ebsd.ericsson.net + Subclasses must set _REDBEE_CUSTOMER, _REDBEE_BUSINESS_UNIT + """ + return f'https://exposure.api.redbee.live/v2/customer/{self._REDBEE_CUSTOMER}/businessunit/{self._REDBEE_BUSINESS_UNIT}' + + def _get_bearer_token(self, asset_id, jwt=None): + request = { + 'deviceId': self._DEVICE_ID, + 'device': { + 'deviceId': self._DEVICE_ID, + 'name': 'Mozilla Firefox 102', + 'type': 'WEB', + }, + } + if jwt: + request['jwt'] = jwt + + return self._download_json( + f'{self._API_URL}/auth/{"gigyaLogin" if jwt else "anonymous"}', + asset_id, data=json.dumps(request).encode('utf-8'), headers={ + 'Content-Type': 'application/json;charset=utf-8' + })['sessionToken'] + + def _get_formats_and_subtitles(self, asset_id, **kwargs): + bearer_token = self._get_bearer_token(asset_id, **kwargs) + api_response = self._download_json( + f'{self._API_URL}/entitlement/{asset_id}/play', + asset_id, headers={ + 'Authorization': f'Bearer {bearer_token}', + 'Accept': 'application/json, text/plain, */*' + }) + + formats, subtitles = [], {} + for format in api_response['formats']: + if not format.get('mediaLocator'): + continue + + fmts, subs = [], {} + if format.get('format') == 'DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'SMOOTHSTREAMING': + fmts, subs = self._extract_ism_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'HLS': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return formats, subtitles + + +class ParliamentLiveUKIE(RedBeeBaseIE): + IE_NAME = 
'parliamentlive.tv' + IE_DESC = 'UK parliament videos' + _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _REDBEE_CUSTOMER = 'UKParliament' + _REDBEE_BUSINESS_UNIT = 'ParliamentLive' + + _TESTS = [{ + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'info_dict': { + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'timestamp': 1395153872, + 'upload_date': '20140318', + 'thumbnail': r're:https?://[^?#]+c1e9d44d-fd6c-4263-b50f-97ed26cc998b[^/]*/thumbnail', + }, + }, { + 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', + 'only_matching': True, + }, { + 'url': 'https://parliamentlive.tv/Event/Index/27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'info_dict': { + 'id': '27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'ext': 'mp4', + 'title': 'House of Commons', + 'timestamp': 1658392447, + 'upload_date': '20220721', + 'thumbnail': r're:https?://[^?#]+27cf25e4-e77b-42a3-93c5-c815cd6d7377[^/]*/thumbnail', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats, subtitles = self._get_formats_and_subtitles(video_id) + self._sort_formats(formats) + + video_info = self._download_json( + f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id, fatal=False) + + self._sort_formats(formats, ['res', 'proto']) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(video_info, ('event', 'title')), + 'thumbnail': traverse_obj(video_info, 'thumbnailUrl'), + 'timestamp': traverse_obj( + video_info, ('event', 'publishedStartTime'), expected_type=unified_timestamp), + } + + +class RTBFIE(RedBeeBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _NETRC_MACHINE = 'rtbf' + + _REDBEE_CUSTOMER = 'RTBF' + _REDBEE_BUSINESS_UNIT = 'Auvio' + + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + }, + 'skip': 'No longer available', + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_investigation?id=2921926', + 'md5': 'd5d11bb62169fef38d7ce7ac531e034f', + 'info_dict': { + 'id': '2921926', + 'ext': 'mp4', + 'title': 'Le handicap un confinement perpétuel - Maladie de Lyme', + 'description': 'md5:dcbd5dcf6015488c9069b057c15ccc52', + 'duration': 5258.8, + 'upload_date': '20220727', + 
'timestamp': 1658934000, + 'series': '#Investigation', + 'thumbnail': r're:^https?://[^?&]+\.jpg$', + }, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_la-belgique-criminelle?id=2920492', + 'md5': '054f9f143bc79c89647c35e5a7d35fa8', + 'info_dict': { + 'id': '2920492', + 'ext': 'mp4', + 'title': '04 - Le crime de la rue Royale', + 'description': 'md5:0c3da1efab286df83f2ab3f8f96bd7a6', + 'duration': 1574.6, + 'upload_date': '20220723', + 'timestamp': 1658596887, + 'series': 'La Belgique criminelle - TV', + 'thumbnail': r're:^https?://[^?&]+\.jpg$', + }, + }] + + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + _LOGIN_URL = 'https://login.rtbf.be/accounts.login' + _GIGYA_API_KEY = '3_kWKuPgcdAybqnqxq_MvHVk0-6PN8Zk8pIIkJM_yXOu-qLPDDsGOtIDFfpGivtbeO' + _LOGIN_COOKIE_ID = f'glt_{_GIGYA_API_KEY}' + + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get(self._LOGIN_COOKIE_ID): + return + + self._set_cookie('.rtbf.be', 'gmid', 'gmid.ver4', secure=True, expire_time=time.time() + 3600) + + login_response = self._download_json( + self._LOGIN_URL, None, data=urllib.parse.urlencode({ + 'loginID': username, + 'password': password, + 'APIKey': self._GIGYA_API_KEY, + 'targetEnv': 'jssdk', + 'sessionExpiration': '-2', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + if login_response['statusCode'] != 200: + raise ExtractorError('Login failed. Server message: %s' % login_response['errorMessage'], expected=True) + + self._set_cookie('.rtbf.be', self._LOGIN_COOKIE_ID, login_response['sessionInfo']['login_token'], + secure=True, expire_time=time.time() + 3600) + + def _get_formats_and_subtitles(self, url, media_id): + login_token = self._get_cookies(url).get(self._LOGIN_COOKIE_ID) + if not login_token: + self.raise_login_required() + + session_jwt = self._download_json( + 'https://login.rtbf.be/accounts.getJWT', media_id, query={ + 'login_token': login_token.value, + 'APIKey': self._GIGYA_API_KEY, + 'sdk': 'js_latest', + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': '13273', + 'format': 'json', + })['id_token'] + + return super()._get_formats_and_subtitles(media_id, jwt=session_jwt) + + def _real_extract(self, url): + live, media_id = self._match_valid_url(url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = data['subtitle'] + is_live = data.get('isLive') + height_re = r'-(\d+)p\.' 
+ formats = [] + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = data.get('urlDash') + if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + if not formats: + fmts, subs = self._get_formats_and_subtitles(url, media_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + self._sort_formats(formats, ['res', 'proto']) + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/rtbf.py b/yt_dlp/extractor/rtbf.py deleted file mode 100644 index a300a24824..0000000000 --- a/yt_dlp/extractor/rtbf.py +++ /dev/null @@ -1,156 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - strip_or_none, -) - - -class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?rtbf\.be/ - (?: - video/[^?]+\?.*\bid=| - ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*\b(?P<live>l)?id= - )(?P<id>\d+)''' - _TESTS = [{ - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '8c876a1cceeb6cf31b476461ade72384', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'description': '(du 25/04/2014)', - 'duration': 3099.54, - 'upload_date': '20140425', - 'timestamp': 1398456300, - } - }, { - # geo restricted - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', - 'only_matching': True, - }, { - # Live - 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', - 'only_matching': True, - }, { - # Audio 
- 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', - 'only_matching': True, - }, { - # With Subtitle - 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', - 'only_matching': True, - }] - _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' - _PROVIDERS = { - 'YOUTUBE': 'Youtube', - 'DAILYMOTION': 'Dailymotion', - 'VIMEO': 'Vimeo', - } - _QUALITIES = [ - ('mobile', 'SD'), - ('web', 'MD'), - ('high', 'HD'), - ] - - def _real_extract(self, url): - live, media_id = self._match_valid_url(url).groups() - embed_page = self._download_webpage( - 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), - media_id, query={'id': media_id}) - data = self._parse_json(self._html_search_regex( - r'data-media="([^"]+)"', embed_page, 'media data'), media_id) - - error = data.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - provider = data.get('provider') - if provider in self._PROVIDERS: - return self.url_result(data['url'], self._PROVIDERS[provider]) - - title = data['title'] - is_live = data.get('isLive') - height_re = r'-(\d+)p\.' - formats = [] - - m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x - http_url = data.get('url') - if formats and http_url and re.search(height_re, http_url): - http_url = fix_url(http_url) - for m3u8_f in formats[:]: - height = m3u8_f.get('height') - if not height: - continue - f = m3u8_f.copy() - del f['protocol'] - f.update({ - 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), - 'url': re.sub(height_re, '-%dp.' 
% height, http_url), - }) - formats.append(f) - else: - sources = data.get('sources') or {} - for key, format_id in self._QUALITIES: - format_url = sources.get(key) - if not format_url: - continue - height = int_or_none(self._search_regex( - height_re, format_url, 'height', default=None)) - formats.append({ - 'format_id': format_id, - 'url': fix_url(format_url), - 'height': height, - }) - - mpd_url = data.get('urlDash') - if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): - formats.extend(self._extract_mpd_formats( - mpd_url, media_id, mpd_id='dash', fatal=False)) - - audio_url = data.get('urlAudio') - if audio_url: - formats.append({ - 'format_id': 'audio', - 'url': audio_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - subtitles = {} - for track in (data.get('tracks') or {}).values(): - sub_url = track.get('url') - if not sub_url: - continue - subtitles.setdefault(track.get('lang') or 'fr', []).append({ - 'url': sub_url, - }) - - return { - 'id': media_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(data.get('description')), - 'thumbnail': data.get('thumbnail'), - 'duration': float_or_none(data.get('realDuration')), - 'timestamp': int_or_none(data.get('liveFrom')), - 'series': data.get('programLabel'), - 'subtitles': subtitles, - 'is_live': is_live, - } From 16d4535abc99d81c3a59314e644b4af6c604e805 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 02:54:19 +0530 Subject: [PATCH 014/284] Update to ytdl-commit-adb5294 [aenetworks] Update _THEPLATFORM_KEY and _THEPLATFORM_SECRET https://github.com/ytdl-org/youtube-dl/commit/adb5294177265ba35b45746dbb600965076ed150 --- README.md | 2 +- yt_dlp/extractor/mediaset.py | 4 ++++ yt_dlp/extractor/vvvvid.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e38c6981a9..57848ff795 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/a03b977](https://github.com/ytdl-org/youtube-dl/commit/a03b9775d544b06a5b4f2aa630214c7c22fc2229)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/adb5294](https://github.com/ytdl-org/youtube-dl/commit/adb5294177265ba35b45746dbb600965076ed150)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 4e549fe5e2..0671c29a66 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -141,6 +141,10 @@ class MediasetIE(ThePlatformBaseIE): # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) 'url': 
'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, + }, { + # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/) + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/', + 'only_matching': True, }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, diff --git a/yt_dlp/extractor/vvvvid.py b/yt_dlp/extractor/vvvvid.py index ccc44d08a3..f0156d10ca 100644 --- a/yt_dlp/extractor/vvvvid.py +++ b/yt_dlp/extractor/vvvvid.py @@ -61,6 +61,18 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/dash' + 'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi', + 'info_dict': { + 'id': '693786', + 'ext': 'mp4', + 'title': 'Nanachi', + }, + 'params': { + 'skip_download': True, + 'format': 'mp4', + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -202,6 +214,9 @@ def metadata_from_url(r_url): }) is_youtube = True break + elif video_type == 'video/dash': + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) From 3157158f7609155906152b8f18d43245d4ee426e Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 03:35:17 +0530 Subject: [PATCH 015/284] Release 2022.08.08 --- CONTRIBUTORS | 9 +++++ Changelog.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++ supportedsites.md | 31 ++++++++--------- 3 files changed, 111 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 47559aa341..cf9b0ea544 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -285,3 +285,12 @@ odo2063 pritam20ps05 scy sheerluck +AxiosDeminence +DjesonPV +eren-kemer +freezboltz +Galiley +haobinliang +Mehavoid +winterbird-code +yashkc2025 diff --git a/Changelog.md b/Changelog.md index 74311052f2..bed128c3d2 100644 --- a/Changelog.md +++ b/Changelog.md @@ -11,6 +11,92 @@ # Instuctions for creating release --> +### 2022.08.08 + +* **Remove Python 3.6 support** +* Determine merge container better by [pukkandan](https://github.com/pukkandan), [selfisekai](https://github.com/selfisekai) +* Framework for embed detection by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* Merge youtube-dl: Upto [commit/adb5294](https://github.com/ytdl-org/youtube-dl/commit/adb5294) +* `--compat-option no-live-chat` should disable danmaku +* Fix misleading DRM message +* Import ctypes only when necessary +* Minor bugfixes by [pukkandan](https://github.com/pukkandan) +* Reject entire playlists faster with `--match-filter` by [pukkandan](https://github.com/pukkandan) +* Remove filtered entries from `-J` +* Standardize retry mechanism by [pukkandan](https://github.com/pukkandan) +* Validate `--merge-output-format` +* [downloader] Add average speed to final progress line +* [extractor] Add field `audio_channels` +* [extractor] Support multiple archive ids for one video +* [ffmpeg] Set `ffmpeg_location` in a contextvar +* [FFmpegThumbnailsConvertor] Fix conversion from GIF +* [MetadataParser] Don't set `None` when the field didn't 
match +* [outtmpl] Smarter replacing of unsupported characters by [pukkandan](https://github.com/pukkandan) +* [outtmpl] Treat empty values as None in filenames +* [utils] sanitize_open: Allow any IO stream as stdout +* [build, devscripts] Add devscript to set a build variant +* [build] Improve build process by [shirt-dev](https://github.com/shirt-dev) +* [build] Update pyinstaller +* [devscripts] Create `utils` and refactor +* [docs] Clarify `best*` +* [docs] Fix bug report issue template +* [docs] Fix capitalization in references by [christoph-heinrich](https://github.com/christoph-heinrich) +* [cleanup, mhtml] Use imghdr +* [cleanup, utils] Consolidate known media extensions +* [cleanup] Misc fixes and cleanup +* [extractor/angel] Add extractor by [AxiosDeminence](https://github.com/AxiosDeminence) +* [extractor/dplay] Add MotorTrend extractor by [Sipherdrakon](https://github.com/Sipherdrakon) +* [extractor/harpodeon] Add extractor by [eren-kemer](https://github.com/eren-kemer) +* [extractor/holodex] Add extractor by [pukkandan](https://github.com/pukkandan), [sqrtNOT](https://github.com/sqrtNOT) +* [extractor/kompas] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/rai] Add raisudtirol extractor by [nixxo](https://github.com/nixxo) +* [extractor/tempo] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/youtube] **Fixes for third party client detection** by [coletdjnz](https://github.com/coletdjnz) +* [extractor/youtube] Add `live_status=post_live` by [lazypete365](https://github.com/lazypete365) +* [extractor/youtube] Extract more format info +* [extractor/youtube] Parse translated subtitles only when requested +* [extractor/youtube, extractor/twitch] Allow waiting for channels to become live +* [extractor/youtube, webvtt] Extract auto-subs from livestream VODs by [fstirlitz](https://github.com/fstirlitz), [pukkandan](https://github.com/pukkandan) +* [extractor/AbemaTVTitle] Implement paging by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/archiveorg] Improve handling of formats by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [extractor/arte] Fix title extraction +* [extractor/arte] **Move to v2 API** by [fstirlitz](https://github.com/fstirlitz), [pukkandan](https://github.com/pukkandan) +* [extractor/bbc] Fix news articles by [ajj8](https://github.com/ajj8) +* [extractor/camtasia] Separate into own extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/cloudflarestream] Fix video_id padding by [haobinliang](https://github.com/haobinliang) +* [extractor/crunchyroll] Fix conversion of thumbnail from GIF by [pukkandan](https://github.com/pukkandan) +* [extractor/crunchyroll] Handle missing metadata correctly by [Burve](https://github.com/Burve), [pukkandan](https://github.com/pukkandan) +* [extractor/crunchyroll:beta] Extract timestamp and fix tests by [tejing1](https://github.com/tejing1) +* [extractor/crunchyroll:beta] Use streams API by [tejing1](https://github.com/tejing1) +* [extractor/doodstream] Support more domains by [Galiley](https://github.com/Galiley) +* [extractor/ESPN] Extract duration by [ischmidt20](https://github.com/ischmidt20) +* [extractor/FIFA] Change API endpoint by [Bricio](https://github.com/Bricio), [yashkc2025](https://github.com/yashkc2025) +* [extractor/globo:article] Remove false positives by [Bricio](https://github.com/Bricio) +* [extractor/Go] Extract timestamp by [ischmidt20](https://github.com/ischmidt20) +* [extractor/hidive] Fix 
cookie login when netrc is also given by [winterbird-code](https://github.com/winterbird-code) +* [extractor/html5] Separate into own extractor by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [extractor/ina] Improve extractor by [elyse0](https://github.com/elyse0) +* [extractor/NaverNow] Change endpoint by [ping](https://github.com/ping) +* [extractor/ninegag] Extract uploader by [DjesonPV](https://github.com/DjesonPV) +* [extractor/NovaPlay] Fix extractor by [Bojidarist](https://github.com/Bojidarist) +* [extractor/orf:radio] Rewrite extractors +* [extractor/patreon] Fix and improve extractors by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [extractor/rai] Fix RaiNews extraction by [nixxo](https://github.com/nixxo) +* [extractor/redbee] Unify and update extractors by [elyse0](https://github.com/elyse0) +* [extractor/stripchat] Fix _VALID_URL by [freezboltz](https://github.com/freezboltz) +* [extractor/tubi] Exclude playlists from playlist entries by [sqrtNOT](https://github.com/sqrtNOT) +* [extractor/tviplayer] Improve `_VALID_URL` by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/twitch] Extract chapters for single chapter VODs by [mpeter50](https://github.com/mpeter50) +* [extractor/vgtv] Support tv.vg.no by [sqrtNOT](https://github.com/sqrtNOT) +* [extractor/vidio] Support embed link by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/vk] Fix extractor by [Mehavoid](https://github.com/Mehavoid) +* [extractor/WASDTV:record] Fix `_VALID_URL` +* [extractor/xfileshare] Add Referer by [Galiley](https://github.com/Galiley) +* [extractor/YahooJapanNews] Fix extractor by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/yandexmusic] Extract higher quality format +* [extractor/zee5] Update Device ID by [m4tu4g](https://github.com/m4tu4g) + + ### 2022.07.18 * Allow users to specify encoding in each config files by [Lesmiscore](https://github.com/Lesmiscore) diff --git a/supportedsites.md b/supportedsites.md index d23e46e3dc..be4fecf4aa 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -18,7 +18,7 @@ # Supported sites - **8tracks** - **91porn** - **9c9media** - - **9gag** + - **9gag**: 9GAG - **9now.com.au** - **abc.net.au** - **abc.net.au:iview** @@ -64,6 +64,7 @@ # Supported sites - **AmericasTestKitchenSeason** - **AmHistoryChannel** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **Angel** - **AnimalPlanet** - **AnimeOnDemand**: [<abbr title="netrc machine"><em>animeondemand</em></abbr>] - **ant1newsgr:article**: ant1news.gr articles @@ -187,6 +188,7 @@ # Supported sites - **Camdemy** - **CamdemyFolder** - **CamModels** + - **CamtasiaEmbed** - **CamWithHer** - **CanalAlpha** - **canalc2.tv** @@ -232,6 +234,7 @@ # Supported sites - **Clippit** - **ClipRs** - **Clipsyndicate** + - **ClipYouEmbed** - **CloserToTruth** - **CloudflareStream** - **Cloudy** @@ -473,6 +476,7 @@ # Supported sites - **gronkh:feed** - **gronkh:vods** - **Groupon** + - **Harpodeon** - **hbo** - **HearThisAt** - **Heise** @@ -491,6 +495,7 @@ # Supported sites - **hitbox:live** - **HitRecord** - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau + - **Holodex** - **HotNewHipHop** - **hotstar** - **hotstar:playlist** @@ -502,6 +507,7 @@ # Supported sites - **HRTiPlaylist**: [<abbr title="netrc machine"><em>hrti</em></abbr>] - **HSEProduct** - **HSEShow** + - **html5** - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post - **Hungama** @@ -573,6 
+579,7 @@ # Supported sites - **KickStarter** - **KinjaEmbed** - **KinoPoisk** + - **KompasVideo** - **KonserthusetPlay** - **Koo** - **KrasView**: Красвью @@ -715,6 +722,7 @@ # Supported sites - **Motherless** - **MotherlessGroup** - **Motorsport**: motorsport.com + - **MotorTrend** - **MovieClips** - **MovieFap** - **Moviepilot** @@ -890,21 +898,10 @@ # Supported sites - **openrec:capture** - **openrec:movie** - **OraTV** - - **orf:burgenland**: Radio Burgenland - - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - - **orf:kaernten**: Radio Kärnten - - **orf:noe**: Radio Niederösterreich - - **orf:oberoesterreich**: Radio Oberösterreich - - **orf:oe1**: Radio Österreich 1 - - **orf:oe3**: Radio Österreich 3 - - **orf:salzburg**: Radio Salzburg - - **orf:steiermark**: Radio Steiermark - - **orf:tirol**: Radio Tirol + - **orf:radio** - **orf:tvthek**: ORF TVthek - - **orf:vorarlberg**: Radio Vorarlberg - - **orf:wien**: Radio Wien - **OsnatelTV**: [<abbr title="netrc machine"><em>osnateltv</em></abbr>] - **OutsideTV** - **PacktPub**: [<abbr title="netrc machine"><em>packtpub</em></abbr>] @@ -922,7 +919,7 @@ # Supported sites - **parliamentlive.tv**: UK parliament videos - **Parlview** - **Patreon** - - **PatreonUser** + - **PatreonCampaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **PearVideo** - **PeekVids** @@ -1030,12 +1027,14 @@ # Supported sites - **radlive:channel** - **radlive:season** - **Rai** + - **RaiNews** - **RaiPlay** - **RaiPlayLive** - **RaiPlayPlaylist** - **RaiPlaySound** - **RaiPlaySoundLive** - **RaiPlaySoundPlaylist** + - **RaiSudtirol** - **RayWenderlich** - **RayWenderlichCourse** - **RBMARadio** @@ -1072,7 +1071,7 @@ # Supported sites - **RoosterTeethSeries**: [<abbr title="netrc machine"><em>roosterteeth</em></abbr>] - **RottenTomatoes** - **Rozhlas** - - **RTBF** + - **RTBF**: [<abbr title="netrc machine"><em>rtbf</em></abbr>] - **RTDocumentry** - **RTDocumentryPlaylist** - **rte**: Raidió Teilifís Éireann TV @@ -1144,6 +1143,7 @@ # Supported sites - **Shahid**: [<abbr title="netrc machine"><em>shahid</em></abbr>] - **ShahidShow** - **Shared**: shared.sx + - **ShareVideosEmbed** - **ShemarooMe** - **ShowRoomLive** - **simplecast** @@ -1268,6 +1268,7 @@ # Supported sites - **TeleQuebecVideo** - **TeleTask** - **Telewebion** + - **Tempo** - **TennisTV**: [<abbr title="netrc machine"><em>tennistv</em></abbr>] - **TenPlay**: [<abbr title="netrc machine"><em>10play</em></abbr>] - **TF1** From f1e2d4a9a21a17c0cc8132b248b81092aeb88206 Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Mon, 8 Aug 2022 22:15:24 +0000 Subject: [PATCH 016/284] [version] update Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++++---- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++++---- 
.github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++++---- yt_dlp/version.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 611e232b56..cf2ce93f01 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,7 +62,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -70,8 +70,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ace41816b6..8b94a7e9ef 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -74,7 +74,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -82,8 +82,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: 
https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 24fbfee931..4c1e1b9235 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -70,7 +70,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -78,8 +78,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index f10339cd81..4d9c6c5799 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -55,7 +55,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -63,8 +63,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} 
[debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 464a3e23a5..4ab6df8062 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -51,7 +51,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -59,7 +59,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 0498e9af1b..2cfd49f3da 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.07.18** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.07.18 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,7 +65,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.07.18, Current version: 2022.07.18 - yt-dlp is up to date (2022.07.18) + Latest version: 2022.08.08, Current version: 2022.08.08 + yt-dlp is up to date (2022.08.08) <more lines> render: shell diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 75ede4973c..955970a2f8 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.07.18' +__version__ = '2022.08.08' -RELEASE_GIT_HEAD = '135f05ef6' +RELEASE_GIT_HEAD = '3157158f7' VARIANT = None From 81e019599835fdb76e661c4b54043eea4ebffff4 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 03:58:20 +0530 Subject: [PATCH 017/284] [build] Fix changelog Bug in c4b6c5c7c9eb0aa448d03c1540580cdd92737aa8 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bd343d95d3..efacecd3c9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -257,7 +257,7 @@ jobs: - name: Get Changelog run: | - changelog=$(grep -oPz '(?s)(?<=### ${{ steps.bump_version.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)' Changelog.md) || true + changelog=$(grep -oPz '(?s)(?<=### ${{ needs.prepare.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)' Changelog.md) || true echo "changelog<<EOF" >> $GITHUB_ENV echo "$changelog" >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV From c220d9efc892a5d94feaeb803e5f5f0a85fd2146 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 04:15:37 +0530 Subject: [PATCH 018/284] [ffmpeg] Disable avconv unless `--prefer-avconv` --- yt_dlp/postprocessor/ffmpeg.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 45f7ab32ea..f663cc28e2 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -109,7 +109,8 @@ def _determine_executables(self): return {p: p for p in programs} if not os.path.exists(location): - self.report_warning(f'ffmpeg-location {location} does not exist! Continuing without ffmpeg') + self.report_warning( + f'ffmpeg-location {location} does not exist! 
Continuing without ffmpeg', only_once=True) return {} elif os.path.isdir(location): dirname, basename = location, None @@ -171,9 +172,9 @@ def probe_basename(self): return self.probe_basename def _get_version(self, kind): - executables = (kind, self._ffmpeg_to_avconv[kind]) + executables = (kind, ) if not self._prefer_ffmpeg: - executables = reversed(executables) + executables = (kind, self._ffmpeg_to_avconv[kind]) basename, version, features = next(filter( lambda x: x[1], ((p, *self._get_ffmpeg_version(p)) for p in executables)), (None, None, {})) if kind == 'ffmpeg': From b5e9a641f537470c8f6fe9d87a33f808c7a9cabb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 05:30:11 +0530 Subject: [PATCH 019/284] [postprocessor/embedthumbnail] Detect libatomicparsley.so --- yt_dlp/postprocessor/embedthumbnail.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 606d90d3d9..9ae59a7c31 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -139,7 +139,8 @@ def run(self, info): if not success: success = True atomicparsley = next(( - x for x in ['AtomicParsley', 'atomicparsley'] + # libatomicparsley.so : See https://github.com/xibr/ytdlp-lazy/issues/1 + x for x in ['AtomicParsley', 'atomicparsley', 'libatomicparsley.so'] if check_executable(x, ['-v'])), None) if atomicparsley is None: self.to_screen('Neither mutagen nor AtomicParsley was found. Falling back to ffmpeg') From 8420a4d06370d4a3db0f068f5fc9520406d33c40 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 9 Aug 2022 05:14:51 +0530 Subject: [PATCH 020/284] [ffmpeg] Smarter detection of ffprobe filename --- yt_dlp/postprocessor/ffmpeg.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index f663cc28e2..6a0a8220ba 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -113,15 +113,20 @@ def _determine_executables(self): f'ffmpeg-location {location} does not exist! 
Continuing without ffmpeg', only_once=True) return {} elif os.path.isdir(location): - dirname, basename = location, None + dirname, basename, filename = location, None, None else: - basename = os.path.splitext(os.path.basename(location))[0] - basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') + filename = os.path.basename(location) + basename = next((p for p in programs if p in filename), 'ffmpeg') dirname = os.path.dirname(os.path.abspath(location)) if basename in self._ffmpeg_to_avconv.keys(): self._prefer_ffmpeg = True paths = {p: os.path.join(dirname, p) for p in programs} + if basename and basename in filename: + for p in programs: + path = os.path.join(dirname, filename.replace(basename, p)) + if os.path.exists(path): + paths[p] = path if basename: paths[basename] = location return paths From 7e798d725ed8337c10bd91c0176265a678c61cf1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 11 Aug 2022 07:22:36 +0530 Subject: [PATCH 021/284] [extractor] Fix format sorting of `channels` --- README.md | 4 ++-- yt_dlp/extractor/common.py | 4 ++-- yt_dlp/extractor/youtube.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 57848ff795..dd3714ad52 100644 --- a/README.md +++ b/README.md @@ -1542,9 +1542,9 @@ ## Sorting Formats All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. Eg: `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. Eg: `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. Eg: `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. Eg: `filesize~1G` prefers the format with filesize closest to 1 GiB. -The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,codec:vp9.2,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. +The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. -Note that the default has `codec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. dolby vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. +Note that the default has `vcodec:vp9.2`; i.e. `av1` is not preferred. 
Similarly, the default for hdr is `hdr:12`; i.e. dolby vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. If your format selector is `worst`, the last item is selected after sorting. This means it will select the format that is worst in all respects. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps`. diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 8afbc76d16..38c72c2d6e 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1669,8 +1669,8 @@ class FormatSort: regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$' default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'channels', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases + 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec', + 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', 'fps', 'fs_approx', 'source', 'id') diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index fc8825b190..b59c8630ae 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3588,7 +3588,8 @@ def feed_entry(name): formats.extend(self._extract_storyboard(player_responses, duration)) # source_preference is lower for throttled/potentially damaged formats - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'channels', 'source', 'codec:vp9.2', 'lang', 'proto')) + self._sort_formats(formats, ( + 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto')) info = { 'id': video_id, From 96623ab5c6cea59c22395a47f00a13d334de6106 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 11 Aug 2022 07:12:20 +0530 Subject: [PATCH 022/284] [devscripts] Fix import Closes #4603 --- devscripts/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 devscripts/__init__.py diff --git a/devscripts/__init__.py b/devscripts/__init__.py new file mode 100644 index 0000000000..750dbdca78 --- /dev/null +++ b/devscripts/__init__.py @@ -0,0 +1 @@ +# Empty file needed to make devscripts.utils properly importable from outside From 1155ecef29187bff975ceb51c755722c660e0387 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 12 Aug 2022 12:50:43 +0530 Subject: [PATCH 023/284] [extractor/zattoo] Fix resellers Fixes #4630 --- yt_dlp/extractor/zattoo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 2a7e854723..975cc71259 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -237,6 +237,10 @@ def _extract_ondemand(self, ondemand_id): ondemand_termtoken=ondemand_termtoken, ondemand_type=ondemand_type) return info_dict + def _real_extract(self, url): + vid1, vid2 = self._match_valid_url(url).group('vid1', 'vid2') + return getattr(self, f'_extract_{self._TYPE}')(vid1 or vid2) + def _make_valid_url(host): return 
rf'https?://(?:www\.)?{re.escape(host)}/watch/[^/]+?/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' @@ -254,10 +258,6 @@ def _create_valid_url(match, qs, base_re=None): {match_base} )''' - def _real_extract(self, url): - vid1, vid2 = self._match_valid_url(url).group('vid1', 'vid2') - return getattr(self, f'_extract_{self._TYPE}')(vid1 or vid2) - class ZattooIE(ZattooBaseIE): _VALID_URL = ZattooBaseIE._create_valid_url(r'\d+', 'program', '(?:program|watch)/[^/]+') From 5da42f2b9b29e69cff8a2ea22d3cf9c586e470d6 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 12 Aug 2022 13:08:32 +0530 Subject: [PATCH 024/284] [extractor/crunchyroll] Improve `_VALID_URL`s Closes #4633 --- yt_dlp/extractor/crunchyroll.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index fccf054803..d4968c13b2 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -114,7 +114,14 @@ def _add_skip_wall(url): class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?!series/|watch/)(?:[^/]+/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'''(?x) + https?://(?:(?P<prefix>www|m)\.)?(?P<url> + crunchyroll\.(?:com|fr)/(?: + media(?:-|/\?id=)| + (?!series/|watch/)(?:[^/]+/){1,2}[^/?&#]*? + )(?P<id>[0-9]+) + )(?:[/?&#]|$)''' + _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -758,7 +765,11 @@ def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, ie class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{2}(?:-\w{2})?/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' + _VALID_URL = r'''(?x) + https?://beta\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + watch/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -780,7 +791,7 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): }, 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y', 'only_matching': True, }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', @@ -867,7 +878,11 @@ def _real_extract(self, url): class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{2}(?:-\w{2})?/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' + _VALID_URL = r'''(?x) + https?://beta\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ series/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -876,7 +891,7 @@ class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): }, 'playlist_mincount': 10, }, { - 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR', 'only_matching': True, }] From a1c5bd82eccf36ed239d368b86ac46db236ff9b1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 12 Aug 2022 18:53:53 +0530 Subject: [PATCH 025/284] [jsinterp] Truncate error messages Related: #4635 --- yt_dlp/jsinterp.py | 34 +++++++++++++++++++--------------- yt_dlp/utils.py | 7 +++++++ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index c95a0ff57b..e85371574c 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -4,7 +4,7 @@ import operator import re -from .utils import ExtractorError, remove_quotes +from .utils import ExtractorError, remove_quotes, truncate_string _NAME_RE = r'[a-zA-Z_$][\w$]*' _OPERATORS = { @@ -53,6 +53,12 @@ def __init__(self, code, objects=None): self.code, self._functions = code, {} self._objects = {} if objects is None else objects + class Exception(ExtractorError): + def __init__(self, msg, expr=None, *args, **kwargs): + if expr is not None: + msg += f' in: {truncate_string(expr, 50, 50)}' + super().__init__(msg, *args, **kwargs) + def _named_object(self, namespace, obj): self.__named_object_counter += 1 name = f'__yt_dlp_jsinterp_obj{self.__named_object_counter}' @@ -92,12 +98,12 @@ def _separate(expr, delim=',', max_split=None): def _separate_at_paren(cls, expr, delim): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise ExtractorError(f'No terminating paren {delim} in {expr}') + raise cls.Exception(f'No terminating paren {delim}', expr) return separated[0][1:].strip(), separated[1].strip() def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') + raise self.Exception('Recursion limit reached') should_abort = False sub_statements = list(self._separate(stmt, ';')) or [''] @@ -177,8 +183,7 @@ def interpret_expression(self, expr, local_vars, allow_recursion): body, expr = remaining, '' start, cndn, increment = self._separate(constructor, ';') if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: - raise ExtractorError( - f'Premature return in the initialization of a for loop in {constructor!r}') + raise self.Exception('Premature return in the initialization of a for loop', constructor) while True: if not self.interpret_expression(cndn, local_vars, allow_recursion): break @@ -191,8 +196,7 @@ def interpret_expression(self, expr, local_vars, allow_recursion): except JS_Continue: pass if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: - raise ExtractorError( - f'Premature return in the initialization of a for loop in {constructor!r}') + raise self.Exception('Premature return in the initialization of a for loop', constructor) return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] elif m and m.group('switch'): @@ -267,11 +271,11 @@ def interpret_expression(self, expr, local_vars, allow_recursion): local_vars[m.group('out')] = opfunc(left_val, right_val) return local_vars[m.group('out')] elif left_val is None: - raise ExtractorError(f'Cannot index undefined variable: 
{m.group("out")}') + raise self.Exception(f'Cannot index undefined variable {m.group("out")}', expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) if not isinstance(idx, int): - raise ExtractorError(f'List indices must be integers: {idx}') + raise self.Exception(f'List index {idx} must be integer', expr) left_val[idx] = opfunc(left_val[idx], right_val) return left_val[idx] @@ -303,11 +307,11 @@ def interpret_expression(self, expr, local_vars, allow_recursion): left_val, should_abort = self.interpret_statement( left_val, local_vars, allow_recursion - 1) if should_abort: - raise ExtractorError(f'Premature left-side return of {op} in {expr!r}') + raise self.Exception(f'Premature left-side return of {op}', expr) right_val, should_abort = self.interpret_statement( right_val, local_vars, allow_recursion - 1) if should_abort: - raise ExtractorError(f'Premature right-side return of {op} in {expr!r}') + raise self.Exception(f'Premature right-side return of {op}', expr) return opfunc(left_val or 0, right_val) if m and m.group('attribute'): @@ -322,7 +326,7 @@ def interpret_expression(self, expr, local_vars, allow_recursion): def assertion(cndn, msg): """ assert, but without risk of getting optimized out """ if not cndn: - raise ExtractorError(f'{member} {msg}: {expr}') + raise self.Exception(f'{member} {msg}', expr) def eval_method(): if variable == 'String': @@ -349,7 +353,7 @@ def eval_method(): if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) - raise ExtractorError(f'Unsupported string method {member}') + raise self.Exception(f'Unsupported string method {member}', expr) if member == 'split': assertion(argvals, 'takes one or more arguments') @@ -430,7 +434,7 @@ def eval_method(): self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) - raise ExtractorError(f'Unsupported JS expression {expr!r}') + raise self.Exception('Unsupported JS expression', expr) def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -469,7 +473,7 @@ def extract_function_code(self, funcname): self.code) code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: - raise ExtractorError(f'Could not find JS function "{funcname}"') + raise self.Exception(f'Could not find JS function "{funcname}"') return func_m.group('args').split(','), code def extract_function(self, funcname): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 3a33cad2e7..17d6e73351 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5759,6 +5759,13 @@ def make_archive_id(ie, video_id): return f'{ie_key.lower()} {video_id}' +def truncate_string(s, left, right=0): + assert left > 3 and right >= 0 + if s is None or len(s) <= left + right: + return s + return f'{s[:left-3]}...{s[-right:]}' + + # Deprecated has_certifi = bool(certifi) has_websockets = bool(websockets) From ffcd62c2899a7d0cd4aeceaed922d3d0a6c1c582 Mon Sep 17 00:00:00 2001 From: shirt <shirt@shirt.rip> Date: Fri, 12 Aug 2022 19:40:49 -0400 Subject: [PATCH 026/284] [extractor/tubitv] Extract additional formats (#4646) Authored by: shirt-dev --- yt_dlp/extractor/tubitv.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py index ea38162ae3..d91a46500c 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -70,16 +70,17 @@ def _perform_login(self, username, password): 
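# The hunk below switches the metadata request to ask for DASH and both HLS
# variants at once, then dispatches on each resource's `type`. A minimal
# sketch of that pattern, assuming the endpoint returns a `video_resources`
# list with `type` and `manifest.url` keys (field names are taken from the
# diff itself; the helper is hypothetical, not part of the patch):
def _formats_from_resources(ie, video_data, video_id):
    formats = []
    for resource in video_data.get('video_resources') or []:
        manifest_url = (resource.get('manifest') or {}).get('url')
        if not manifest_url:
            continue  # nothing playable advertised for this resource
        if resource.get('type') == 'dash':
            formats += ie._extract_mpd_formats(
                manifest_url, video_id, mpd_id='dash', fatal=False)
        elif resource.get('type') in ('hlsv3', 'hlsv6'):
            formats += ie._extract_m3u8_formats(
                manifest_url, video_id, 'mp4',
                m3u8_id=resource['type'], fatal=False)
    return formats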
def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id) + 'https://tubitv.com/oz/videos/%s/content?video_resources=dash&video_resources=hlsv3&video_resources=hlsv6' % video_id, video_id) title = video_data['title'] formats = [] - url = video_data['url'] - # URL can be sometimes empty. Does this only happen when there is DRM? - if url: - formats = self._extract_m3u8_formats( - self._proto_relative_url(url), - video_id, 'mp4', 'm3u8_native') + + for resource in video_data['video_resources']: + if resource['type'] in ('dash', ): + formats += self._extract_mpd_formats(resource['manifest']['url'], video_id, mpd_id=resource['type'], fatal=False) + elif resource['type'] in ('hlsv3', 'hlsv6'): + formats += self._extract_m3u8_formats(resource['manifest']['url'], video_id, 'mp4', m3u8_id=resource['type'], fatal=False) + self._sort_formats(formats) thumbnails = [] From cea4b857f0019205b6a473b3a053aa36403892ed Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 13 Aug 2022 00:25:20 +0000 Subject: [PATCH 027/284] [patreon] Ignore erroneous media attachments (#4638) Fixes https://github.com/yt-dlp/yt-dlp/issues/4608 Authored by: coletdjnz --- yt_dlp/extractor/patreon.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 95fda3b694..529aba178c 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -154,6 +154,28 @@ class PatreonIE(PatreonBaseIE): 'channel_url': 'https://www.patreon.com/loish', 'channel_follower_count': int, } + }, { + # bad videos under media (if media is included). Real one is under post_file + 'url': 'https://www.patreon.com/posts/premium-access-70282931', + 'info_dict': { + 'id': '70282931', + 'ext': 'mp4', + 'title': '[Premium Access + Uncut] The Office - 2x6 The Fight - Group Reaction', + 'channel_url': 'https://www.patreon.com/thenormies', + 'channel_id': '573397', + 'uploader_id': '2929435', + 'uploader': 'The Normies', + 'description': 'md5:79c9fd8778e2cef84049a94c058a5e23', + 'comment_count': int, + 'upload_date': '20220809', + 'thumbnail': r're:^https?://.*$', + 'channel_follower_count': int, + 'like_count': int, + 'timestamp': 1660052820, + 'tags': ['The Office', 'early access', 'uncut'], + 'uploader_url': 'https://www.patreon.com/thenormies', + }, + 'skip': 'Patron-only content', }] def _real_extract(self, url): @@ -166,7 +188,7 @@ def _real_extract(self, url): 'fields[post_tag]': 'value', 'fields[campaign]': 'url,name,patron_count', 'json-api-use-default-includes': 'false', - 'include': 'media,user,user_defined_tags,campaign', + 'include': 'audio,user,user_defined_tags,campaign,attachments_media', }) attributes = post['data']['attributes'] title = attributes['title'].strip() @@ -190,11 +212,16 @@ def _real_extract(self, url): media_attributes = i.get('attributes') or {} download_url = media_attributes.get('download_url') ext = mimetype2ext(media_attributes.get('mimetype')) - if download_url and ext in KNOWN_EXTENSIONS: + + # if size_bytes is None, this media file is likely unavailable + # See: https://github.com/yt-dlp/yt-dlp/issues/4608 + size_bytes = int_or_none(media_attributes.get('size_bytes')) + if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: + # XXX: what happens if there are multiple attachments? 
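# A null `size_bytes` marks an unavailable stub: as the test case above notes,
# Patreon can list dead videos under `media` while the real stream lives under
# `post_file`, so skipping such entries here lets extraction fall through to
# the later branches instead of returning a broken download URL.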
return { **info, 'ext': ext, - 'filesize': int_or_none(media_attributes.get('size_bytes')), + 'filesize': size_bytes, 'url': download_url, } elif i_type == 'user': From 1cddfdc52b39f6760a70869632d12577b080b69c Mon Sep 17 00:00:00 2001 From: Jacob Truman <jacob.truman@gmail.com> Date: Sat, 13 Aug 2022 11:26:41 -0600 Subject: [PATCH 028/284] [extractor/aenetworks] Add formats parameter (#4645) Closes #4047 Authored by: jacobtruman --- yt_dlp/extractor/aenetworks.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index 86a10f2dcd..516cb6302c 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -28,14 +28,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', From 8f53dc44a0cc1c2d98c35740b9293462c080f5d0 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 14 Aug 2022 04:51:54 +0530 Subject: [PATCH 029/284] [jsinterp] Handle new youtube signature functions Closes #4635 --- test/test_jsinterp.py | 29 ++- test/test_utils.py | 4 + test/test_youtube_signature.py | 8 + yt_dlp/extractor/youtube.py | 3 +- yt_dlp/jsinterp.py | 339 ++++++++++++++++++++++----------- yt_dlp/utils.py | 29 ++- 6 files changed, 287 insertions(+), 125 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4277cabe02..48e2abcf66 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -19,6 +19,9 @@ def test_basic(self): jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) + jsi = JSInterpreter('function x3(){42}') + self.assertEqual(jsi.call_function('x3'), None) + jsi = JSInterpreter('var x5 = function(){return 42;}') self.assertEqual(jsi.call_function('x5'), 42) @@ -51,8 +54,11 @@ def test_operators(self): jsi = JSInterpreter('function f(){return 11 >> 2;}') self.assertEqual(jsi.call_function('f'), 2) + jsi = JSInterpreter('function f(){return []? 
2+3: 4;}') + self.assertEqual(jsi.call_function('f'), 5) + def test_array_access(self): - jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2] = 7; return x;}') + jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) def test_parens(self): @@ -62,6 +68,10 @@ def test_parens(self): jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') self.assertEqual(jsi.call_function('f'), 9) + def test_quotes(self): + jsi = JSInterpreter(R'function f(){return "a\"\\("}') + self.assertEqual(jsi.call_function('f'), R'a"\(') + def test_assignments(self): jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), 31) @@ -107,14 +117,15 @@ def test_precedence(self): def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } - function y(a) { return x() + a; } + function y(a) { return x() + (a?a:0); } function z() { return y(3); } ''') self.assertEqual(jsi.call_function('z'), 5) + self.assertEqual(jsi.call_function('y'), 2) def test_for_loop(self): jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) {a++} a } + function x() { a=0; for (i=0; i-10; i++) {a++} return a } ''') self.assertEqual(jsi.call_function('x'), 10) @@ -155,19 +166,19 @@ def test_try(self): def test_for_loop_continue(self): jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } ''') self.assertEqual(jsi.call_function('x'), 0) def test_for_loop_break(self): jsi = JSInterpreter(''' - function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + function x() { a=0; for (i=0; i-10; i++) { break; a++ } return a } ''') self.assertEqual(jsi.call_function('x'), 0) def test_literal_list(self): jsi = JSInterpreter(''' - function x() { [1, 2, "asdf", [5, 6, 7]][3] } + function x() { return [1, 2, "asdf", [5, 6, 7]][3] } ''') self.assertEqual(jsi.call_function('x'), [5, 6, 7]) @@ -177,6 +188,12 @@ def test_comma(self): ''') self.assertEqual(jsi.call_function('x'), 7) + def test_return_function(self): + jsi = JSInterpreter(''' + function x() { return [1, function(){return 1}][1] } + ''') + self.assertEqual(jsi.call_function('x')([]), 1) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 659b071d3c..67cd966d8e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -413,6 +413,10 @@ def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) + self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) + self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78) + def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4fc2917e59..559bdfccff 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -94,6 +94,14 @@ 'https://www.youtube.com/s/player/5dd88d1d/player-plasma-ias-phone-en_US.vflset/base.js', 'kSxKFLeqzv_ZyHSAt', 'n8gS8oRlHOxPFA', ), + ( + 
'https://www.youtube.com/s/player/324f67b9/player_ias.vflset/en_US/base.js', + 'xdftNy7dh9QGnhW', '22qLGxrmX8F1rA', + ), + ( + 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', + 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b59c8630ae..ef289e48ce 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2653,7 +2653,8 @@ def _extract_n_function(self, video_id, player_url): if self.get_param('youtube_print_sig_code'): self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') - return lambda s: jsi.extract_function_from_code(*func_code)([s]) + func = jsi.extract_function_from_code(*func_code) + return lambda s: func([s]) def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index e85371574c..1af6ee0aa2 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -1,29 +1,62 @@ import collections import contextlib +import itertools import json +import math import operator import re -from .utils import ExtractorError, remove_quotes, truncate_string +from .utils import ( + NO_DEFAULT, + ExtractorError, + js_to_json, + remove_quotes, + truncate_string, + unified_timestamp, + write_string, +) _NAME_RE = r'[a-zA-Z_$][\w$]*' -_OPERATORS = { +_OPERATORS = { # None => Defined in JSInterpreter._operator + '?': None, + + '||': None, + '&&': None, + '&': operator.and_, '|': operator.or_, '^': operator.xor, - '&': operator.and_, + + # FIXME: This should actually be below comparision '>>': operator.rshift, '<<': operator.lshift, - '-': operator.sub, + + '<=': operator.le, + '>=': operator.ge, + '<': operator.lt, + '>': operator.gt, + '+': operator.add, - '%': operator.mod, - '/': operator.truediv, + '-': operator.sub, + '*': operator.mul, + '/': operator.truediv, + '%': operator.mod, } _MATCHING_PARENS = dict(zip('({[', ')}]')) _QUOTES = '\'"' +def _ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, ''): + return if_false + with contextlib.suppress(TypeError): + if math.isnan(cndn): # NB: NaN cannot be checked by membership + return if_false + return if_true + + class JS_Break(ExtractorError): def __init__(self): ExtractorError.__init__(self, 'Invalid break') @@ -46,6 +79,27 @@ def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') +class Debugger: + import sys + ENABLED = 'pytest' in sys.modules + + @staticmethod + def write(*args, level=100): + write_string(f'[debug] JS: {" " * (100 - level)}' + f'{" ".join(truncate_string(str(x), 50, 50) for x in args)}\n') + + @classmethod + def wrap_interpreter(cls, f): + def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs): + if cls.ENABLED and stmt.strip(): + cls.write(stmt, level=allow_recursion) + ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs) + if cls.ENABLED and stmt.strip(): + cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) + return ret, should_ret + return interpret_statement + + class JSInterpreter: __named_object_counter = 0 @@ -56,7 +110,7 @@ def __init__(self, code, objects=None): class Exception(ExtractorError): def __init__(self, msg, expr=None, *args, **kwargs): if expr is not None: - msg += f' in: {truncate_string(expr, 50, 50)}' + msg = f'{msg.rstrip()} in: {truncate_string(expr, 50, 50)}' super().__init__(msg, 
*args, **kwargs) def _named_object(self, namespace, obj): @@ -73,9 +127,9 @@ def _separate(expr, delim=',', max_split=None): start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping = None, False for idx, char in enumerate(expr): - if char in _MATCHING_PARENS: + if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 - elif char in counters: + elif not in_quote and char in counters: counters[char] -= 1 elif not escaping and char in _QUOTES and in_quote in (char, None): in_quote = None if in_quote else char @@ -101,50 +155,91 @@ def _separate_at_paren(cls, expr, delim): raise cls.Exception(f'No terminating paren {delim}', expr) return separated[0][1:].strip(), separated[1].strip() + def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): + if op in ('||', '&&'): + if (op == '&&') ^ _ternary(left_val): + return left_val # short circuiting + elif op == '?': + right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) + + right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) + if not _OPERATORS.get(op): + return right_val + + try: + return _OPERATORS[op](left_val, right_val) + except Exception as e: + raise self.Exception(f'Failed to evaluate {left_val!r} {op} {right_val!r}', expr, cause=e) + + def _index(self, obj, idx): + if idx == 'length': + return len(obj) + try: + return obj[int(idx)] if isinstance(obj, list) else obj[idx] + except Exception as e: + raise self.Exception(f'Cannot get index {idx}', repr(obj), cause=e) + + def _dump(self, obj, namespace): + try: + return json.dumps(obj) + except TypeError: + return self._named_object(namespace, obj) + + @Debugger.wrap_interpreter def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise self.Exception('Recursion limit reached') + allow_recursion -= 1 - should_abort = False + should_return = False sub_statements = list(self._separate(stmt, ';')) or [''] - stmt = sub_statements.pop().lstrip() + expr = stmt = sub_statements.pop().strip() for sub_stmt in sub_statements: - ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) - if should_abort: - return ret, should_abort + ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion) + if should_return: + return ret, should_return m = re.match(r'(?P<var>var\s)|return(?:\s+|$)', stmt) - if not m: # Try interpreting it as an expression - expr = stmt - elif m.group('var'): - expr = stmt[len(m.group(0)):] - else: - expr = stmt[len(m.group(0)):] - should_abort = True - - return self.interpret_expression(expr, local_vars, allow_recursion), should_abort - - def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() + if m: + expr = stmt[len(m.group(0)):].strip() + should_return = not m.group('var') if not expr: - return None + return None, should_return + + if expr[0] in _QUOTES: + inner, outer = self._separate(expr, expr[0], 1) + inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) + if not outer: + return inner, should_return + expr = self._named_object(local_vars, inner) + outer + + if expr.startswith('new '): + obj = expr[4:] + if obj.startswith('Date('): + left, right = self._separate_at_paren(obj[4:], ')') + expr = unified_timestamp(left[1:-1], False) + if not expr: + raise self.Exception(f'Failed to parse date {left!r}', expr) + expr = self._dump(int(expr * 1000), local_vars) + right + else: + raise self.Exception(f'Unsupported object {obj}', expr) if 
expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') - inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: - return inner + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('('): inner, outer = self._separate_at_paren(expr, ')') - inner = self.interpret_expression(inner, local_vars, allow_recursion) - if not outer: - return inner + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + if not outer or should_abort: + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('['): inner, outer = self._separate_at_paren(expr, ']') @@ -153,21 +248,23 @@ def interpret_expression(self, expr, local_vars, allow_recursion): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'(?P<try>try)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) + m = re.match(r'(?P<try>try|finally)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) if m and m.group('try'): if expr[m.end()] == '{': try_expr, expr = self._separate_at_paren(expr[m.end():], '}') else: try_expr, expr = expr[m.end() - 1:], '' - ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) if should_abort: - return ret - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + return ret, True + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return elif m and m.group('catch'): # We ignore the catch block _, expr = self._separate_at_paren(expr, '}') - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return elif m and m.group('for'): constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') @@ -182,22 +279,21 @@ def interpret_expression(self, expr, local_vars, allow_recursion): else: body, expr = remaining, '' start, cndn, increment = self._separate(constructor, ';') - if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: - raise self.Exception('Premature return in the initialization of a for loop', constructor) + self.interpret_expression(start, local_vars, allow_recursion) while True: - if not self.interpret_expression(cndn, local_vars, allow_recursion): + if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: - ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: - return ret + return ret, True except JS_Break: break except JS_Continue: pass - if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: - raise self.Exception('Premature return in the initialization of a for loop', constructor) - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + self.interpret_expression(increment, local_vars, allow_recursion) + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or 
should_return elif m and m.group('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') @@ -215,20 +311,23 @@ def interpret_expression(self, expr, local_vars, allow_recursion): if not matched: continue try: - ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion) if should_abort: return ret except JS_Break: break if matched: break - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return # Comma separated statements sub_expressions = list(self._separate(expr)) expr = sub_expressions.pop().strip() if sub_expressions else '' for sub_expr in sub_expressions: - self.interpret_expression(sub_expr, local_vars, allow_recursion) + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True for m in re.finditer(rf'''(?x) (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| @@ -240,10 +339,10 @@ def interpret_expression(self, expr, local_vars, allow_recursion): local_vars[var] += 1 if sign[0] == '+' else -1 if m.group('pre_sign'): ret = local_vars[var] - expr = expr[:start] + json.dumps(ret) + expr[end:] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] if not expr: - return None + return None, should_return m = re.match(fr'''(?x) (?P<assign> @@ -251,36 +350,34 @@ def interpret_expression(self, expr, local_vars, allow_recursion): (?P<op>{"|".join(map(re.escape, _OPERATORS))})? =(?P<expr>.*)$ )|(?P<return> - (?!if|return|true|false|null)(?P<name>{_NAME_RE})$ + (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ )|(?P<indexing> (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ )|(?P<attribute> (?P<var>{_NAME_RE})(?:\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* )|(?P<function> - (?P<fname>{_NAME_RE})\((?P<args>[\w$,]*)\)$ + (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ )''', expr) if m and m.group('assign'): - if not m.group('op'): - opfunc = lambda curr, right: right - else: - opfunc = _OPERATORS[m.group('op')] - right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) left_val = local_vars.get(m.group('out')) if not m.group('index'): - local_vars[m.group('out')] = opfunc(left_val, right_val) - return local_vars[m.group('out')] + local_vars[m.group('out')] = self._operator( + m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) + return local_vars[m.group('out')], should_return elif left_val is None: raise self.Exception(f'Cannot index undefined variable {m.group("out")}', expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) - if not isinstance(idx, int): + if not isinstance(idx, (int, float)): raise self.Exception(f'List index {idx} must be integer', expr) - left_val[idx] = opfunc(left_val[idx], right_val) - return left_val[idx] + idx = int(idx) + left_val[idx] = self._operator( + m.group('op'), left_val[idx], m.group('expr'), expr, local_vars, allow_recursion) + return left_val[idx], should_return elif expr.isdigit(): - return int(expr) + return int(expr), should_return elif expr == 'break': raise JS_Break() @@ -288,35 +385,33 @@ def interpret_expression(self, expr, local_vars, allow_recursion): raise JS_Continue() elif m and m.group('return'): - return local_vars[m.group('name')] + return local_vars[m.group('name')], should_return with contextlib.suppress(ValueError): - return 
json.loads(expr) + return json.loads(js_to_json(expr, strict=True)), should_return if m and m.group('indexing'): val = local_vars[m.group('in')] idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) - return val[idx] + return self._index(val, idx), should_return - for op, opfunc in _OPERATORS.items(): + for op in _OPERATORS: separated = list(self._separate(expr, op)) if len(separated) < 2: continue - right_val = separated.pop() - left_val = op.join(separated) - left_val, should_abort = self.interpret_statement( - left_val, local_vars, allow_recursion - 1) - if should_abort: - raise self.Exception(f'Premature left-side return of {op}', expr) - right_val, should_abort = self.interpret_statement( - right_val, local_vars, allow_recursion - 1) - if should_abort: - raise self.Exception(f'Premature right-side return of {op}', expr) - return opfunc(left_val or 0, right_val) + right_expr = separated.pop() + while op == '-' and len(separated) > 1 and not separated[-1].strip(): + right_expr = f'-{right_expr}' + separated.pop() + left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) + return self._operator(op, 0 if left_val is None else left_val, + right_expr, expr, local_vars, allow_recursion), should_return if m and m.group('attribute'): variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) + member = m.group('member') + if not member: + member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] if arg_str.startswith('('): arg_str, remaining = self._separate_at_paren(arg_str, ')') @@ -329,20 +424,24 @@ def assertion(cndn, msg): raise self.Exception(f'{member} {msg}', expr) def eval_method(): - if variable == 'String': - obj = str - elif variable in local_vars: - obj = local_vars[variable] - else: + if (variable, member) == ('console', 'debug'): + if Debugger.ENABLED: + Debugger.write(self.interpret_expression(f'[{arg_str}]', local_vars, allow_recursion)) + return + + types = { + 'String': str, + 'Math': float, + } + obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) + if obj is NO_DEFAULT: if variable not in self._objects: self._objects[variable] = self.extract_object(variable) obj = self._objects[variable] # Member access if arg_str is None: - if member == 'length': - return len(obj) - return obj[member] + return self._index(obj, member) # Function call argvals = [ @@ -353,12 +452,17 @@ def eval_method(): if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) - raise self.Exception(f'Unsupported string method {member}', expr) + raise self.Exception(f'Unsupported String method {member}', expr) + elif obj == float: + if member == 'pow': + assertion(len(argvals) == 2, 'takes two arguments') + return argvals[0] ** argvals[1] + raise self.Exception(f'Unsupported Math method {member}', expr) if member == 'split': assertion(argvals, 'takes one or more arguments') - assertion(argvals == [''], 'with arguments is not implemented') - return list(obj) + assertion(len(argvals) == 1, 'with limit argument is not implemented') + return obj.split(argvals[0]) if argvals[0] else list(obj) elif member == 'join': assertion(isinstance(obj, list), 'must be applied on a list') assertion(len(argvals) == 1, 'takes exactly one argument') @@ -404,7 +508,7 @@ def eval_method(): assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') f, this = (argvals + [''])[:2] - 
return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] elif member == 'indexOf': assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') @@ -414,27 +518,35 @@ def eval_method(): except ValueError: return -1 - return obj[int(member) if isinstance(obj, list) else member](argvals) + idx = int(member) if isinstance(obj, list) else member + return obj[idx](argvals, allow_recursion=allow_recursion) if remaining: - return self.interpret_expression( + ret, should_abort = self.interpret_statement( self._named_object(local_vars, eval_method()) + remaining, local_vars, allow_recursion) + return ret, should_return or should_abort else: - return eval_method() + return eval_method(), should_return elif m and m.group('function'): fname = m.group('fname') - argvals = tuple( - int(v) if v.isdigit() else local_vars[v] - for v in self._separate(m.group('args'))) + argvals = [self.interpret_expression(v, local_vars, allow_recursion) + for v in self._separate(m.group('args'))] if fname in local_vars: - return local_vars[fname](argvals) + return local_vars[fname](argvals, allow_recursion=allow_recursion), should_return elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) + return self._functions[fname](argvals, allow_recursion=allow_recursion), should_return - raise self.Exception('Unsupported JS expression', expr) + raise self.Exception( + f'Unsupported JS expression {truncate_string(expr, 20, 20) if expr != stmt else ""}', stmt) + + def interpret_expression(self, expr, local_vars, allow_recursion): + ret, should_return = self.interpret_statement(expr, local_vars, allow_recursion) + if should_return: + raise self.Exception('Cannot return from an expression', expr) + return ret def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -446,6 +558,8 @@ def extract_object(self, objname): }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) + if not obj_m: + raise self.Exception(f'Could not find object {objname}') fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( @@ -462,19 +576,19 @@ def extract_object(self, objname): def extract_function_code(self, funcname): """ @returns argnames, code """ func_m = re.search( - r'''(?x) + r'''(?xs) (?: function\s+%(name)s| [{;,]\s*%(name)s\s*=\s*function| var\s+%(name)s\s*=\s*function )\s* \((?P<args>[^)]*)\)\s* - (?P<code>{(?:(?!};)[^"]|"([^"]|\\")*")+})''' % {'name': re.escape(funcname)}, + (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match + code, _ = self._separate_at_paren(func_m.group('code'), '}') if func_m is None: raise self.Exception(f'Could not find JS function "{funcname}"') - return func_m.group('args').split(','), code + return [x.strip() for x in func_m.group('args').split(',')], code def extract_function(self, funcname): return self.extract_function_from_code(*self.extract_function_code(funcname)) @@ -498,16 +612,15 @@ def call_function(self, funcname, *args): def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] + argnames = tuple(argnames) - def resf(args, **kwargs): + def resf(args, kwargs={}, allow_recursion=100): global_stack[0].update({ - **dict(zip(argnames, args)), + 
**dict(itertools.zip_longest(argnames, args, fillvalue=None)), **kwargs }) var_stack = LocalNameSpace(*global_stack) - for stmt in self._separate(code.replace('\n', ''), ';'): - ret, should_abort = self.interpret_statement(stmt, var_stack) - if should_abort: - break - return ret + ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, allow_recursion - 1) + if should_abort: + return ret return resf diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 17d6e73351..39a41d5b8a 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -150,6 +150,16 @@ def random_user_agent(): 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], } +# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 +TIMEZONE_NAMES = { + 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, + 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) + 'EST': -5, 'EDT': -4, # Eastern + 'CST': -6, 'CDT': -5, # Central + 'MST': -7, 'MDT': -6, # Mountain + 'PST': -8, 'PDT': -7 # Pacific +} + # needed for sanitizing filenames in restricted mode ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], @@ -1684,7 +1694,11 @@ def extract_timezone(date_str): $) ''', date_str) if not m: - timezone = datetime.timedelta() + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) + if timezone is not None: + date_str = date_str[:-len(m.group('tz'))] + timezone = datetime.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): @@ -1746,7 +1760,8 @@ def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = re.sub(r'[,|]', '', date_str) + date_str = re.sub(r'\s+', ' ', re.sub( + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -1768,9 +1783,10 @@ def unified_timestamp(date_str, day_first=True): with contextlib.suppress(ValueError): dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) return calendar.timegm(dt.timetuple()) + timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 + return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() def determine_ext(url, default_ext='unknown_video'): @@ -3199,7 +3215,7 @@ def strip_jsonp(code): r'\g<callback_data>', code) -def js_to_json(code, vars={}): +def js_to_json(code, vars={}, *, strict=False): # vars is a dict of var, val pairs to substitute COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*' @@ -3233,14 +3249,17 @@ def fix_kv(m): if v in vars: return vars[v] + if strict: + raise ValueError(f'Unknown value: {v}') return '"%s"' % v def create_map(mobj): return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) - code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) + if not strict: + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| From 62b58c0936cccc6f3e5115086406c7bfaf6fc551 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Sun, 14 Aug 2022 21:04:13 +0900 Subject: [PATCH 030/284] [docs] Consistent use of 
`e.g.` (#4643)

Authored by: Lesmiscore
---
 CONTRIBUTING.md              |   4 +-
 Changelog.md                 |   6 +-
 README.md                    | 137 +++++++++++++++++------------------
 supportedsites.md            |   2 +-
 yt_dlp/YoutubeDL.py          |  12 +--
 yt_dlp/downloader/f4m.py     |   2 +-
 yt_dlp/extractor/abematv.py  |   2 +-
 yt_dlp/extractor/common.py   |  20 ++--
 yt_dlp/extractor/generic.py  |   2 +-
 yt_dlp/extractor/openload.py |   2 +-
 yt_dlp/extractor/youtube.py  |   6 +-
 yt_dlp/minicurses.py         |   2 +-
 yt_dlp/options.py            |  54 +++++++-------
 yt_dlp/utils.py              |   6 +-
 14 files changed, 128 insertions(+), 129 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6d9546033c..d9d5f47304 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -195,7 +195,7 @@ ## Adding support for a new site
      # * A value
      # * MD5 checksum; start the string with md5:
      # * A regular expression; start the string with re:
-     # * Any Python type (for example int or float)
+     # * Any Python type, e.g. int or float
  }
 }]

@@ -261,7 +261,7 @@ ### Mandatory and optional metafields

 For pornographic sites, appropriate `age_limit` must also be returned.

-The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract usefull information with `--ignore-no-formats-error` - Eg: when the video is a live stream that has not started yet.
+The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract useful information with `--ignore-no-formats-error` - e.g. when the video is a live stream that has not started yet.

 [Any field](yt_dlp/extractor/common.py#219-L426) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.

diff --git a/Changelog.md b/Changelog.md
index bed128c3d2..483c947b60 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -211,7 +211,7 @@ ### 2022.06.22

 * [**Deprecate support for Python 3.6**](https://github.com/yt-dlp/yt-dlp/issues/3764#issuecomment-1154051119)
 * **Add option `--download-sections` to download video partially**
-    * Chapter regex and time ranges are accepted (Eg: `--download-sections *1:10-2:20`)
+    * Chapter regex and time ranges are accepted, e.g. `--download-sections *1:10-2:20`
 * Add option `--alias`
 * Add option `--lazy-playlist` to process entries as they are received
 * Add option `--retry-sleep`
@@ -1375,7 +1375,7 @@ ### 2021.09.25

 * Add new option `--netrc-location`
 * [outtmpl] Allow alternate fields using `,`
-* [outtmpl] Add format type `B` to treat the value as bytes (eg: to limit the filename to a certain number of bytes)
+* [outtmpl] Add format type `B` to treat the value as bytes, e.g. to limit the filename to a certain number of bytes
 * Separate the options `--ignore-errors` and `--no-abort-on-error`
 * Basic framework for simultaneous download of multiple formats by [nao20010128nao](https://github.com/nao20010128nao)
 * [17live] Add 17.live extractor by [nao20010128nao](https://github.com/nao20010128nao)
@@ -1765,7 +1765,7 @@ ### 2021.07.07

 * Merge youtube-dl: Upto [commit/a803582](https://github.com/ytdl-org/youtube-dl/commit/a8035827177d6b59aca03bd717acb6a9bdd75ada)
 * Add `--extractor-args` to pass some extractor-specific arguments. See [readme](https://github.com/yt-dlp/yt-dlp#extractor-arguments)
-    * Add extractor option `skip` for `youtube`. 
Eg: `--extractor-args youtube:skip=hls,dash` + * Add extractor option `skip` for `youtube`, e.g. `--extractor-args youtube:skip=hls,dash` * Deprecates `--youtube-skip-dash-manifest`, `--youtube-skip-hls-manifest`, `--youtube-include-dash-manifest`, `--youtube-include-hls-manifest` * Allow `--list...` options to work with `--print`, `--quiet` and other `--list...` options * [youtube] Use `player` API for additional video extraction requests by [coletdjnz](https://github.com/coletdjnz) diff --git a/README.md b/README.md index dd3714ad52..9672a17718 100644 --- a/README.md +++ b/README.md @@ -376,7 +376,7 @@ ## General Options: --extractor-descriptions Output descriptions of all supported extractors and exit --force-generic-extractor Force extraction to use the generic extractor - --default-search PREFIX Use this prefix for unqualified URLs. Eg: + --default-search PREFIX Use this prefix for unqualified URLs. E.g. "gvsearch2:python" downloads two videos from google videos for the search term "python". Use the value "auto" to let yt-dlp guess @@ -425,7 +425,7 @@ ## General Options: an alias starts with a dash "-", it is prefixed with "--". Arguments are parsed according to the Python string formatting - mini-language. Eg: --alias get-audio,-X + mini-language. E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options "--get-audio" and "-X" that takes an argument (ARG0) and expands to @@ -439,10 +439,10 @@ ## General Options: ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. To - enable SOCKS proxy, specify a proper scheme. - Eg: socks5://user:pass@127.0.0.1:1080/. Pass - in an empty string (--proxy "") for direct - connection + enable SOCKS proxy, specify a proper scheme, + e.g. socks5://user:pass@127.0.0.1:1080/. + Pass in an empty string (--proxy "") for + direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to -4, --force-ipv4 Make all connections via IPv4 @@ -471,17 +471,17 @@ ## Video Selection: compatibility, START-STOP is also supported. Use negative indices to count from the right and negative STEP to download in reverse - order. Eg: "-I 1:3,7,-5::2" used on a + order. E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15 - --min-filesize SIZE Do not download any videos smaller than SIZE - (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) + --min-filesize SIZE Do not download any videos smaller than + SIZE, e.g. 50k or 44.6M + --max-filesize SIZE Do not download any videos larger than SIZE, + e.g. 50k or 44.6M --date DATE Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format [now|today|yesterday][-N[day|week|month|year]]. - Eg: --date today-2weeks + E.g. --date today-2weeks --datebefore DATE Download only videos uploaded on or before this date. The date formats accepted is the same as --date @@ -498,7 +498,7 @@ ## Video Selection: conditions. Use a "\" to escape "&" or quotes if needed. If used multiple times, the filter matches if atleast one of the - conditions are met. Eg: --match-filter + conditions are met. E.g. 
--match-filter !is_live --match-filter "like_count>?100 & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live OR those that @@ -536,11 +536,11 @@ ## Download Options: -N, --concurrent-fragments N Number of fragments of a dash/hlsnative video that should be downloaded concurrently (default is 1) - -r, --limit-rate RATE Maximum download rate in bytes per second - (e.g. 50K or 4.2M) + -r, --limit-rate RATE Maximum download rate in bytes per second, + e.g. 50K or 4.2M --throttled-rate RATE Minimum download rate in bytes per second below which throttling is assumed and the - video data is re-extracted (e.g. 100K) + video data is re-extracted, e.g. 100K -R, --retries RETRIES Number of retries (default is 10), or "infinite" --file-access-retries RETRIES Number of times to retry on file access @@ -554,7 +554,7 @@ ## Download Options: be a number, linear=START[:END[:STEP=1]] or exp=START[:END[:BASE=2]]. This option can be used multiple times to set the sleep for the - different retry types. Eg: --retry-sleep + different retry types, e.g. --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20 --skip-unavailable-fragments Skip unavailable fragments for DASH, hlsnative and ISM downloads (default) @@ -566,14 +566,14 @@ ## Download Options: downloading is finished --no-keep-fragments Delete downloaded fragments after downloading is finished (default) - --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) + --buffer-size SIZE Size of download buffer, e.g. 1024 or 16K (default is 1024) --resize-buffer The buffer size is automatically resized from an initial value of --buffer-size (default) --no-resize-buffer Do not automatically adjust the buffer size --http-chunk-size SIZE Size of a chunk for chunk-based HTTP - downloading (e.g. 10485760 or 10M) (default + downloading, e.g. 10485760 or 10M (default is disabled). May be useful for bypassing bandwidth throttling imposed by a webserver (experimental) @@ -598,10 +598,10 @@ ## Download Options: the given regular expression. Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. - Eg: --download-sections "*10:15-15:00" - --download-sections "intro". Needs ffmpeg. - This option can be used multiple times to - download multiple sections + Needs ffmpeg. This option can be used + multiple times to download multiple + sections, e.g. --download-sections + "*10:15-15:00" --download-sections "intro" --downloader [PROTO:]NAME Name or path of the external downloader to use (optionally) prefixed by the protocols (http, ftp, m3u8, dash, rstp, rtmp, mms) to @@ -609,7 +609,7 @@ ## Download Options: aria2c, avconv, axel, curl, ffmpeg, httpie, wget. You can use this option multiple times to set different downloaders for different - protocols. For example, --downloader aria2c + protocols. E.g. --downloader aria2c --downloader "dash,m3u8:native" will use aria2c for http/ftp downloads, and the native downloader for dash/m3u8 downloads @@ -791,7 +791,7 @@ ## Verbosity and Simulation Options: "postprocess:", or "postprocess-title:". The video's fields are accessible under the "info" key and the progress attributes are - accessible under "progress" key. E.g.: + accessible under "progress" key. E.g. --console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s" -v, --verbose Print various debugging information @@ -860,7 +860,7 @@ ## Video Format Options: -F, --list-formats List available formats of each video. 
Simulate unless --no-simulate is used --merge-output-format FORMAT Containers that may be used when merging - formats, separated by "/" (Eg: "mp4/mkv"). + formats, separated by "/", e.g. "mp4/mkv". Ignored if no merge is required. (currently supported: avi, flv, mkv, mov, mp4, webm) @@ -874,13 +874,13 @@ ## Subtitle Options: --list-subs List available subtitles of each video. Simulate unless --no-simulate is used --sub-format FORMAT Subtitle format; accepts formats preference, - Eg: "srt" or "ass/srt/best" + e.g. "srt" or "ass/srt/best" --sub-langs LANGS Languages of the subtitles to download (can - be regex) or "all" separated by commas. (Eg: - --sub-langs "en.*,ja") You can prefix the + be regex) or "all" separated by commas, e.g. + --sub-langs "en.*,ja". You can prefix the language code with a "-" to exclude it from - the requested languages. (Eg: --sub-langs - all,-live_chat) Use --list-subs for a list + the requested languages, e.g. --sub-langs + all,-live_chat. Use --list-subs for a list of available language tags ## Authentication Options: @@ -929,7 +929,7 @@ ## Post-Processing Options: m4a, mka, mp3, ogg, opus, vorbis, wav). If target container does not support the video/audio codec, remuxing will fail. You - can specify multiple rules; Eg. + can specify multiple rules; e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv --recode-video FORMAT Re-encode the video into another format if @@ -954,7 +954,7 @@ ## Post-Processing Options: for ffmpeg/ffprobe, "_i"/"_o" can be appended to the prefix optionally followed by a number to pass the argument before the - specified input/output file. Eg: --ppa + specified input/output file, e.g. --ppa "Merger+ffmpeg_i1:-v quiet". You can use this option multiple times to give different arguments to different postprocessors. @@ -1081,7 +1081,7 @@ ## SponsorBlock Options: music_offtopic, poi_highlight, all and default (=all). You can prefix the category with a "-" to exclude it. See [1] for - description of the categories. Eg: + description of the categories. E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories --sponsorblock-remove CATS SponsorBlock categories to be removed from @@ -1140,7 +1140,7 @@ # CONFIGURATION 1. **System Configuration**: `/etc/yt-dlp.conf` -For example, with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: +E.g. with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: ``` # Lines starting with # are comments @@ -1178,7 +1178,7 @@ ### Authentication with `.netrc` file ``` machine <extractor> login <username> password <password> ``` -For example: +E.g. ``` machine youtube login myaccount@gmail.com password my_youtube_password machine twitch login my_twitch_account_name password my_twitch_password @@ -1197,32 +1197,32 @@ # OUTPUT TEMPLATE The simplest usage of `-o` is not to set any template arguments when downloading a single file, like in `yt-dlp -o funny_video.flv "https://some/video"` (hard-coding file extension like this is _not_ recommended and could break some post-processing). -It may however also contain special sequences that will be replaced when downloading each video. 
The special sequences may be formatted according to [Python string formatting operations](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. +It may however also contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [Python string formatting operations](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting), e.g. `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. The field names themselves (the part inside the parenthesis) can also have some special formatting: -1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a `.` (dot) separator. You can also do python slicing using `:`. Eg: `%(tags.0)s`, `%(subtitles.en.-1.ext)s`, `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. `%()s` refers to the entire infodict. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields +1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a `.` (dot) separator. You can also do python slicing using `:`. E.g. `%(tags.0)s`, `%(subtitles.en.-1.ext)s`, `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. `%()s` refers to the entire infodict. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields -1. **Addition**: Addition and subtraction of numeric fields can be done using `+` and `-` respectively. Eg: `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d` +1. **Addition**: Addition and subtraction of numeric fields can be done using `+` and `-` respectively. E.g. `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d` -1. **Date/time Formatting**: Date/time fields can be formatted according to [strftime formatting](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) by specifying it separated from the field name using a `>`. Eg: `%(duration>%H-%M-%S)s`, `%(upload_date>%Y-%m-%d)s`, `%(epoch-3600>%H-%M-%S)s` +1. **Date/time Formatting**: Date/time fields can be formatted according to [strftime formatting](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) by specifying it separated from the field name using a `>`. E.g. `%(duration>%H-%M-%S)s`, `%(upload_date>%Y-%m-%d)s`, `%(epoch-3600>%H-%M-%S)s` -1. **Alternatives**: Alternate fields can be specified separated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s` +1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s` 1. **Replacement**: A replacement value can specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. -1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s` +1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. E.g. 
`%(uploader|Unknown)s` -1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (Eg: 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted) +1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (e.g. 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted) -1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. Eg: `%(title)+.100U` is NFKC +1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. E.g. `%(title)+.100U` is NFKC To summarize, the general syntax for a field is: ``` %(name[.keys][addition][>strf][,alternate][&replacement][|default])[flags][width][.precision][length]type ``` -Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`, `pl_video`. For example, `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates is empty, that type of file will not be written. Eg: `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video. +Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`, `pl_video`. E.g. `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates is empty, that type of file will not be written. E.g. `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video. 
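To make the per-type templates concrete, here is a minimal sketch using the embedding API (an illustration only, not part of this patch; it assumes the dict form of `outtmpl` and the `writethumbnail` key documented in the `YoutubeDL` docstring):

```python
import yt_dlp

# Embedding equivalent of:
#   yt-dlp --write-thumbnail -o "%(title)s.%(ext)s" -o "thumbnail:%(title)s/%(title)s.%(ext)s" URL
ydl_opts = {
    'writethumbnail': True,  # also save the thumbnail file
    'outtmpl': {
        'default': '%(title)s.%(ext)s',              # general output template
        'thumbnail': '%(title)s/%(title)s.%(ext)s',  # thumbnails go into a per-title folder
    },
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```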
The available fields are: @@ -1358,13 +1358,13 @@ # OUTPUT TEMPLATE - `category_names` (list): Friendly names of the categories - `name` (string): Friendly name of the smallest category -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. E.g. for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). **Tip**: Look at the `-j` output to identify which fields are available for the particular URL -For numeric sequences you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting), for example, `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. +For numeric sequences you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting); e.g. `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. Output templates can also contain arbitrary hierarchical path, e.g. `-o "%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s"` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. @@ -1434,7 +1434,7 @@ # FORMAT SELECTION **tl;dr:** [navigate me to examples](#format-selection-examples). <!-- MANPAGE: END EXCLUDED SECTION --> -The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. +The simplest case is requesting a specific format; e.g. with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file. @@ -1461,15 +1461,15 @@ # FORMAT SELECTION You can select the n'th best format of a type by using `best<type>.<n>`. For example, `best.2` will select the 2nd best combined format. Similarly, `bv*.3` will select the 3rd best format that contains a video stream. -If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes. 
Note that formats on the left hand side are preferred, for example `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download. +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes. Note that formats on the left hand side are preferred; e.g. `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download. If you want to download several formats of the same video use a comma as a separator, e.g. `-f 22,17,18` will download all these three formats, of course if they are available. Or a more sophisticated example combined with the precedence feature: `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. -You can merge the video and audio of multiple formats into a single file using `-f <format1>+<format2>+...` (requires ffmpeg installed), for example `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg. +You can merge the video and audio of multiple formats into a single file using `-f <format1>+<format2>+...` (requires ffmpeg installed); e.g. `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg. **Deprecation warning**: Since the *below* described behavior is complex and counter-intuitive, this will be removed and multistreams will be enabled by default in the future. A new operator will be instead added to limit formats to single audio/video -Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. For example, `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. `best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. `-f best+bestaudio --no-audio-multistreams` will download and merge both formats while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. +Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. E.g. `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. `best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. 
`-f best+bestaudio --no-audio-multistreams` will download and merge both formats while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. ## Filtering Formats @@ -1500,9 +1500,9 @@ ## Filtering Formats Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. -Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter. For example, `-f "all[vcodec=none]"` selects all audio-only formats. +Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. -Format selectors can also be grouped using parentheses, for example if you want to download the best pre-merged mp4 and webm formats with a height lower than 480 you can use `-f "(mp4,webm)[height<480]"`. +Format selectors can also be grouped using parentheses; e.g. `-f "(mp4,webm)[height<480]"` will download the best pre-merged mp4 and webm formats with a height lower than 480. ## Sorting Formats @@ -1540,7 +1540,7 @@ ## Sorting Formats **Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. -All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. Eg: `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. Eg: `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. Eg: `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. Eg: `filesize~1G` prefers the format with filesize closest to 1 GiB. +All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. E.g. `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. E.g. `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. E.g. 
`+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided one by using `~` as the delimiter. E.g. `filesize~1G` prefers the format with filesize closest to 1 GiB.

 The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order.

@@ -1685,9 +1685,9 @@ # MODIFYING METADATA

 This option also has a few special uses:

-* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. Eg: `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)` will download the first vimeo video found in the description
+* You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. E.g. `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)"` will download the first vimeo video found in the description

-* You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file. For example, you can use this to set a different "description" and "synopsis". To modify the metadata of individual streams, use the `meta<n>_` prefix (Eg: `meta1_language`). Any value set to the `meta_` field will overwrite all default values.
+* You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to the `meta_description` field will be added to the `description` field in the file - you can use this to set a different "description" and "synopsis". To modify the metadata of individual streams, use the `meta<n>_` prefix (e.g. `meta1_language`). Any value set to the `meta_` field will overwrite all default values.

 **Note**: Metadata modification happens before format selection, post-extraction and other post-processing operations. Some fields may be added or changed during these steps, overriding your changes.

@@ -1746,20 +1746,20 @@ # Replace all spaces and "_" in title and uploader with a `-`

 # EXTRACTOR ARGUMENTS

-Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. Eg: `--extractor-args "youtube:player-client=android_embedded,web;include_live_dash" --extractor-args "funimation:version=uncut"`
+Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. 
`--extractor-args "youtube:player-client=android_embedded,web;include_live_dash" --extractor-args "funimation:version=uncut"` The following extractors use this feature: #### youtube * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (Eg: `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `innertube_host`: Innertube API host to use for all API requests - * e.g. `studio.youtube.com`, `youtubei.googleapis.com` + * E.g. `studio.youtube.com`, `youtubei.googleapis.com` * Note: Cookies exported from `www.youtube.com` will not work with hosts other than `*.youtube.com` * `innertube_key`: Innertube API key to use for all API requests @@ -1768,17 +1768,16 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `approximate_date`: Extract approximate `upload_date` in flat-playlist. This may cause date-based filters to be slightly off #### funimation -* `language`: Languages to extract. Eg: `funimation:language=english,japanese` +* `language`: Languages to extract, e.g. `funimation:language=english,japanese` * `version`: The video version to extract - `uncut` or `simulcast` #### crunchyroll -* `language`: Languages to extract. Eg: `crunchyroll:language=jaJp` -* `hardsub`: Which hard-sub versions to extract. Eg: `crunchyroll:hardsub=None,enUS` +* `language`: Languages to extract, e.g. `crunchyroll:language=jaJp` +* `hardsub`: Which hard-sub versions to extract, e.g. 
`crunchyroll:hardsub=None,enUS` #### crunchyrollbeta -* `format`: Which stream type(s) to extract. Default is `adaptive_hls` Eg: `crunchyrollbeta:format=vo_adaptive_hls` - * Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2` -* `hardsub`: Preference order for which hardsub versions to extract. Default is `None` (no hardsubs). Eg: `crunchyrollbeta:hardsub=en-US,None` +* `format`: Which stream type(s) to extract (default: `adaptive_hls`). Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2` +* `hardsub`: Preference order for which hardsub versions to extract (default: `None` = no hardsubs), e.g. `crunchyrollbeta:hardsub=en-US,None` #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` @@ -1798,11 +1797,11 @@ #### hotstar * `dr`: dynamic range to ignore - one or more of `sdr`, `hdr10`, `dv` #### tiktok -* `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`. (e.g. `20.2.1`) -* `manifest_app_version`: Numeric app version to call mobile APIs with. (e.g. `221`) +* `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`, e.g. `20.2.1` +* `manifest_app_version`: Numeric app version to call mobile APIs with, e.g. `221` #### rokfinchannel -* `tab`: Which tab to download. One of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`. (E.g. `rokfinchannel:tab=streams`) +* `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` NOTE: These options may be changed/removed in the future without concern for backward compatibility @@ -2066,7 +2065,7 @@ #### Not recommended --all-formats -f all --all-subs --sub-langs all --write-subs --print-json -j --no-simulate - --autonumber-size NUMBER Use string formatting. Eg: %(autonumber)03d + --autonumber-size NUMBER Use string formatting, e.g. %(autonumber)03d --autonumber-start NUMBER Use internal field formatting like %(autonumber+NUMBER)s --id -o "%(id)s.%(ext)s" --metadata-from-title FORMAT --parse-metadata "%(title)s:FORMAT" diff --git a/supportedsites.md b/supportedsites.md index be4fecf4aa..e5f808396a 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1584,7 +1584,7 @@ # Supported sites - **youtube:clip** - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies) - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies) - - **youtube:music:search_url**: YouTube music search URLs with selectable sections (Eg: #songs) + - **youtube:music:search_url**: YouTube music search URLs with selectable sections, e.g. #songs - **youtube:notif**: YouTube notifications; ":ytnotif" keyword (requires cookies) - **youtube:playlist**: YouTube playlists - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 2b7af4cd7e..498e8dd8e2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -272,7 +272,7 @@ class YoutubeDL: subtitleslangs: List of languages of the subtitles to download (can be regex). The list may contain "all" to refer to all the available subtitles. The language can be prefixed with a "-" to - exclude it from the requested languages. 
Eg: ['all', '-live_chat'] + exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file @@ -302,7 +302,7 @@ class YoutubeDL: cookiefile: File name or text stream from where cookies should be read and dumped to cookiesfrombrowser: A tuple containing the name of the browser, the profile name/pathfrom where cookies are loaded, and the name of the - keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + keyring, e.g. ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') legacyserverconnect: Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation nocheckcertificate: Do not verify SSL certificates @@ -470,7 +470,7 @@ class YoutubeDL: discontinuities such as ad breaks (default: False) extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. - Eg: {'youtube': {'skip': ['dash', 'hls']}} + E.g. {'youtube': {'skip': ['dash', 'hls']}} mark_watched: Mark videos watched (even with --simulate). Only for YouTube The following options are deprecated and may be removed in the future: @@ -1046,7 +1046,7 @@ def _outtmpl_expandpath(outtmpl): # outtmpl should be expand_path'ed before template dict substitution # because meta fields may contain env variables we don't want to - # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and # title "Hello $PATH", we don't want `$PATH` to be expanded. return expand_path(outtmpl).replace(sep, '') @@ -1977,8 +1977,8 @@ def _parse_filter(tokens): filter_parts.append(string) def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the surrounding strings - # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + # Remove operators that we don't use and join them with the surrounding strings. + # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None for type, string, start, end, line in tokens: diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py index 770354de77..a19ab43f15 100644 --- a/yt_dlp/downloader/f4m.py +++ b/yt_dlp/downloader/f4m.py @@ -184,7 +184,7 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: - # In some live HDS streams (for example Rai), `fragments_count` is + # In some live HDS streams (e.g. Rai), `fragments_count` is # abnormal and causing out-of-memory errors. It's OK to change the # number of fragments for live streams as they are updated periodically if fragments_count == 4294967295 and boot_info['live']: diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index d8ad78705c..9955fb289f 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -365,7 +365,7 @@ def _real_extract(self, url): # read breadcrumb on top of page breadcrumb = self._extract_breadcrumb_list(webpage, video_id) if breadcrumb: - # breadcrumb list translates to: (example is 1st test for this IE) + # breadcrumb list translates to: (e.g. 
1st test for this IE) # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title) # hence this works info['series'] = breadcrumb[-2] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 38c72c2d6e..a534703e53 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -331,7 +331,7 @@ class InfoExtractor: playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string - specifying the criteria for embedability (Eg: 'whitelist') + specifying the criteria for embedability; e.g. 'whitelist' availability: Under what condition the video is available. One of 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public'. Use 'InfoExtractor._availability' @@ -452,8 +452,8 @@ class InfoExtractor: _extract_from_webpage may raise self.StopExtraction() to stop further processing of the webpage and obtain exclusive rights to it. This is useful - when the extractor cannot reliably be matched using just the URL. - Eg: invidious/peertube instances + when the extractor cannot reliably be matched using just the URL, + e.g. invidious/peertube instances Embed-only extractors can be defined by setting _VALID_URL = False. @@ -2367,7 +2367,7 @@ def build_stream_name(): audio_group_id = last_stream_inf.get('AUDIO') # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] + # However, this is not always respected. E.g. [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite # referencing an audio group it represents a complete @@ -3003,8 +3003,8 @@ def add_segment_url(): segment_number += 1 segment_time += segment_d elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # No media template, + # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] segment_index = 0 @@ -3021,7 +3021,7 @@ def add_segment_url(): representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 # https://github.com/ytdl-org/youtube-dl/pull/14844 fragments = [] segment_duration = float_or_none( @@ -3249,8 +3249,8 @@ def _media_formats(src, cur_media_type, type_info=None): media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see - # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). + # https://github.com/ytdl-org/youtube-dl/issues/11979, + # e.g. http://www.porntrex.com/maps/videositemap.xml). 
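+            # (Illustrative note, not part of the original patch: assuming
+            # _MEDIA_TAG_NAME_RE keeps one capturing group for the bare tag name,
+            # each match of the pattern below is a tuple of (opening tag, prefixed
+            # tag name, bare media type, tag body) - see the 4-value unpacking in
+            # the loop that follows)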
r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
         for media_tag, _, media_type, media_content in media_tags:
             media_info = {
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index d3ed7ce461..e32ec1c8fa 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -3035,7 +3035,7 @@ def filter_video(urls):
             self.report_detected('Twitter card')
         if not found:
             # We look for Open Graph info:
-            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
+            # We have to match any number of spaces between elements, some sites try to align them, e.g.: statigr.am
             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
             if m_video_type is not None:
diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py
index f844ee6fbf..f12a0eff11 100644
--- a/yt_dlp/extractor/openload.py
+++ b/yt_dlp/extractor/openload.py
@@ -169,7 +169,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w
         In most cases you don't need to add any `jscode`.
         It is executed in `page.onLoadFinished`.
         `saveAndExit();` is mandatory, use it instead of `phantom.exit()`
-        It is possible to wait for some element on the webpage, for example:
+        It is possible to wait for some element on the webpage, e.g.
             var check = function() {
               var elementFound = page.evaluate(function() {
                 return document.querySelector('#b.done') !== null;
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index ef289e48ce..5ac481bd76 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3247,9 +3247,9 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i
             else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
             else -1)
         # Some formats may have much smaller duration than others (possibly damaged during encoding)
-        # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
+        # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
         # Make sure to avoid false positives with small duration differences.
-        # Eg: __2ABJjxzNo, ySuUZEjARPY
+        # E.g. __2ABJjxzNo, ySuUZEjARPY
         is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
         if is_damaged:
             self.report_warning(
@@ -5834,7 +5834,7 @@ def _real_extract(self, url):


 class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
-    IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)'
+    IE_DESC = 'YouTube music search URLs with selectable sections, e.g. 
#songs' IE_NAME = 'youtube:music:search_url' _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' _TESTS = [{ diff --git a/yt_dlp/minicurses.py b/yt_dlp/minicurses.py index a867fd2898..7db02cb59c 100644 --- a/yt_dlp/minicurses.py +++ b/yt_dlp/minicurses.py @@ -34,7 +34,7 @@ def format_text(text, f): ''' @param f String representation of formatting to apply in the form: [style] [light] font_color [on [light] bg_color] - Eg: "red", "bold green on light blue" + E.g. "red", "bold green on light blue" ''' f = f.upper() tokens = f.strip().split() diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 2c7f686dde..9d75c39769 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -77,7 +77,7 @@ def add_config(label, path, user=False): if root.parse_known_args()[0].ignoreconfig: return False # Multiple package names can be given here - # Eg: ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for + # E.g. ('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for # the configuration file of any of these three packages for package in ('yt-dlp',): if user: @@ -374,7 +374,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): dest='default_search', metavar='PREFIX', help=( 'Use this prefix for unqualified URLs. ' - 'Eg: "gvsearch2:python" downloads two videos from google videos for the search term "python". ' + 'E.g. "gvsearch2:python" downloads two videos from google videos for the search term "python". ' 'Use the value "auto" to let yt-dlp guess ("auto_warning" to emit a warning when guessing). ' '"error" just throws an error. The default value "fixup_error" repairs broken URLs, ' 'but emits an error if this is not possible instead of searching')) @@ -459,7 +459,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=( 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' 'Arguments are parsed according to the Python string formatting mini-language. ' - 'Eg: --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' '"--get-audio" and "-X" that takes an argument (ARG0) and expands to ' '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' @@ -471,8 +471,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--proxy', dest='proxy', default=None, metavar='URL', help=( - 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable SOCKS proxy, specify a proper scheme. ' - 'Eg: socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection')) + 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable SOCKS proxy, specify a proper scheme, ' + 'e.g. socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection')) network.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, metavar='SECONDS', @@ -537,7 +537,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Comma separated playlist_index of the videos to download. ' 'You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. ' 'Use negative indices to count from the right and negative STEP to download in reverse order. 
' - 'Eg: "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15')) + 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15')) selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', @@ -549,17 +549,17 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): selection.add_option( '--min-filesize', metavar='SIZE', dest='min_filesize', default=None, - help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)') + help='Do not download any videos smaller than SIZE, e.g. 50k or 44.6M') selection.add_option( '--max-filesize', metavar='SIZE', dest='max_filesize', default=None, - help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)') + help='Do not download any videos larger than SIZE, e.g. 50k or 44.6M') selection.add_option( '--date', metavar='DATE', dest='date', default=None, help=( 'Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format ' - '[now|today|yesterday][-N[day|week|month|year]]. Eg: --date today-2weeks')) + '[now|today|yesterday][-N[day|week|month|year]]. E.g. --date today-2weeks')) selection.add_option( '--datebefore', metavar='DATE', dest='datebefore', default=None, @@ -589,7 +589,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if atleast one of the conditions are met. Eg: --match-filter ' + 'the filter matches if atleast one of the conditions are met. E.g. --match-filter ' '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' @@ -785,7 +785,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--merge-output-format', action='store', dest='merge_output_format', metavar='FORMAT', default=None, help=( - 'Containers that may be used when merging formats, separated by "/" (Eg: "mp4/mkv"). ' + 'Containers that may be used when merging formats, separated by "/", e.g. "mp4/mkv". ' 'Ignored if no merge is required. ' f'(currently supported: {", ".join(sorted(FFmpegMergerPP.SUPPORTED_EXTS))})')) video_format.add_option( @@ -825,14 +825,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): subtitles.add_option( '--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format; accepts formats preference, Eg: "srt" or "ass/srt/best"') + help='Subtitle format; accepts formats preference, e.g. "srt" or "ass/srt/best"') subtitles.add_option( '--sub-langs', '--srt-langs', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_list_from_options_callback, help=( - 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs "en.*,ja") ' - 'You can prefix the language code with a "-" to exclude it from the requested languages. (Eg: --sub-langs all,-live_chat) ' + 'Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. --sub-langs "en.*,ja". ' + 'You can prefix the language code with a "-" to exclude it from the requested languages, e.g. 
--sub-langs all,-live_chat. ' 'Use --list-subs for a list of available language tags')) downloader = optparse.OptionGroup(parser, 'Download Options') @@ -843,11 +843,11 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): downloader.add_option( '-r', '--limit-rate', '--rate-limit', dest='ratelimit', metavar='RATE', - help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') + help='Maximum download rate in bytes per second, e.g. 50K or 4.2M') downloader.add_option( '--throttled-rate', dest='throttledratelimit', metavar='RATE', - help='Minimum download rate in bytes per second below which throttling is assumed and the video data is re-extracted (e.g. 100K)') + help='Minimum download rate in bytes per second below which throttling is assumed and the video data is re-extracted, e.g. 100K') downloader.add_option( '-R', '--retries', dest='retries', metavar='RETRIES', default=10, @@ -871,8 +871,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Time to sleep between retries in seconds (optionally) prefixed by the type of retry ' '(http (default), fragment, file_access, extractor) to apply the sleep to. ' 'EXPR can be a number, linear=START[:END[:STEP=1]] or exp=START[:END[:BASE=2]]. ' - 'This option can be used multiple times to set the sleep for the different retry types. ' - 'Eg: --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20')) + 'This option can be used multiple times to set the sleep for the different retry types, ' + 'e.g. --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20')) downloader.add_option( '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragment', action='store_true', dest='skip_unavailable_fragments', default=True, @@ -892,7 +892,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', - help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') + help='Size of download buffer, e.g. 1024 or 16K (default is %default)') downloader.add_option( '--resize-buffer', action='store_false', dest='noresizebuffer', @@ -905,7 +905,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--http-chunk-size', dest='http_chunk_size', metavar='SIZE', default=None, help=( - 'Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' + 'Size of a chunk for chunk-based HTTP downloading, e.g. 10485760 or 10M (default is disabled). ' 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)')) downloader.add_option( '--test', @@ -963,8 +963,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=( 'Download only chapters whose title matches the given regular expression. ' 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' - 'Eg: --download-sections "*10:15-15:00" --download-sections "intro". ' - 'Needs ffmpeg. This option can be used multiple times to download multiple sections')) + 'Needs ffmpeg. This option can be used multiple times to download multiple sections, ' + 'e.g. --download-sections "*10:15-15:00" --download-sections "intro"')) downloader.add_option( '--downloader', '--external-downloader', dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str', @@ -978,7 +978,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'the protocols (http, ftp, m3u8, dash, rstp, rtmp, mms) to use it for. 
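The `--retry-sleep` help above defines two growth curves; the sketch below shows how `linear=1::2` and `exp=1:20` could expand into per-attempt sleeps. `retry_sleep` is a hypothetical parser written for illustration (the END bound, when present, caps the value):

```python
def retry_sleep(expr):
    func, _, args = expr.partition('=')
    start, end, extra = (args.split(':') + [None, None])[:3]
    start = float(start)
    end = float(end) if end else None

    def sleep(attempt):  # attempt counts from 0
        if func == 'linear':
            value = start + attempt * float(extra or 1)   # STEP defaults to 1
        else:  # 'exp'
            value = start * float(extra or 2) ** attempt  # BASE defaults to 2
        return min(value, end) if end is not None else value

    return sleep

assert [retry_sleep('linear=1::2')(n) for n in range(4)] == [1, 3, 5, 7]
assert [retry_sleep('exp=1:20')(n) for n in range(6)] == [1, 2, 4, 8, 16, 20]
```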
' f'Currently supports native, {", ".join(sorted(list_external_downloaders()))}. ' 'You can use this option multiple times to set different downloaders for different protocols. ' - 'For example, --downloader aria2c --downloader "dash,m3u8:native" will use ' + 'E.g. --downloader aria2c --downloader "dash,m3u8:native" will use ' 'aria2c for http/ftp downloads, and the native downloader for dash/m3u8 downloads ' '(Alias: --external-downloader)')) downloader.add_option( @@ -1188,7 +1188,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Template for progress outputs, optionally prefixed with one of "download:" (default), ' '"download-title:" (the console title), "postprocess:", or "postprocess-title:". ' 'The video\'s fields are accessible under the "info" key and ' - 'the progress attributes are accessible under "progress" key. E.g.: ' + 'the progress attributes are accessible under "progress" key. E.g. ' # TODO: Document the fields inside "progress" '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"')) verbosity.add_option( @@ -1488,7 +1488,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Remux the video into another container if necessary ' f'(currently supported: {", ".join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)}). ' 'If target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; ' - 'Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv')) + 'e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv')) postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, @@ -1513,7 +1513,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'You can also specify "PP+EXE:ARGS" to give the arguments to the specified executable ' 'only when being used by the specified postprocessor. Additionally, for ffmpeg/ffprobe, ' '"_i"/"_o" can be appended to the prefix optionally followed by a number to pass the argument ' - 'before the specified input/output file. Eg: --ppa "Merger+ffmpeg_i1:-v quiet". ' + 'before the specified input/output file, e.g. --ppa "Merger+ffmpeg_i1:-v quiet". ' 'You can use this option multiple times to give different arguments to different ' 'postprocessors. (Alias: --ppa)')) postproc.add_option( @@ -1729,7 +1729,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'SponsorBlock categories to create chapters for, separated by commas. ' f'Available categories are {", ".join(SponsorBlockPP.CATEGORIES.keys())}, all and default (=all). ' 'You can prefix the category with a "-" to exclude it. See [1] for description of the categories. ' - 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) + 'E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) sponsorblock.add_option( '--sponsorblock-remove', metavar='CATS', dest='sponsorblock_remove', default=set(), action='callback', type='str', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 39a41d5b8a..e64d359365 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -610,7 +610,7 @@ def sanitize_open(filename, open_mode): if sys.platform == 'win32': import msvcrt - # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout + # stdout may be any IO stream, e.g. 
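How a remux rule list such as `aac>m4a/mov>mp4/mkv` (from the `--remux-video` help above) resolves for a given input container, as a sketch; `pick_remux_target` is a made-up name, and the real option additionally validates the supported extensions:

```python
def pick_remux_target(current_ext, rules):
    for rule in rules.split('/'):          # rules are tried left to right
        src, sep, dst = rule.partition('>')
        if not sep:
            return src                     # bare extension: unconditional fallback
        if current_ext == src:
            return dst
    return current_ext                     # nothing matched, keep the container

assert pick_remux_target('aac', 'aac>m4a/mov>mp4/mkv') == 'm4a'
assert pick_remux_target('mov', 'aac>m4a/mov>mp4/mkv') == 'mp4'
assert pick_remux_target('avi', 'aac>m4a/mov>mp4/mkv') == 'mkv'
```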
when using contextlib.redirect_stdout with contextlib.suppress(io.UnsupportedOperation): msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) @@ -786,8 +786,8 @@ def _htmlentity_transform(entity_with_semicolon): if entity in html.entities.name2codepoint: return chr(html.entities.name2codepoint[entity]) - # TODO: HTML5 allows entities without a semicolon. For example, - # 'Éric' should be decoded as 'Éric'. + # TODO: HTML5 allows entities without a semicolon. + # E.g. 'Éric' should be decoded as 'Éric'. if entity_with_semicolon in html.entities.html5: return html.entities.html5[entity_with_semicolon] From 8f84770acd7b70e7f6876f9ea8c5b1f4f0497b66 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 14 Aug 2022 07:17:11 +0530 Subject: [PATCH 031/284] [utils] Fix `get_compatible_ext` Closes #4647 --- yt_dlp/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index e64d359365..db355ec92a 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3501,8 +3501,8 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): }, } - sanitize_codec = functools.partial(try_get, getter=lambda x: x.split('.')[0].replace('0', '')) - vcodec, acodec = sanitize_codec(vcodecs[0]), sanitize_codec(acodecs[0]) + sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) + vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) for ext in preferences or COMPATIBLE_CODECS.keys(): codec_set = COMPATIBLE_CODECS.get(ext, set()) From a6125983ab4434fc4079f575a4bf22042411ea5e Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 14 Aug 2022 19:03:58 +0530 Subject: [PATCH 032/284] [update] Set executable bit-mask Closes #4621 --- yt_dlp/update.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index a04518c9b6..a5cd11150c 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -9,7 +9,7 @@ from zipimport import zipimporter from .compat import functools # isort: split -from .compat import compat_realpath +from .compat import compat_realpath, compat_shlex_quote from .utils import ( Popen, cached_method, @@ -229,24 +229,32 @@ def update(self): except OSError: return self._report_permission_error(new_filename) - try: - if old_filename: + if old_filename: + try: os.rename(self.filename, old_filename) - except OSError: - return self._report_error('Unable to move current version') - try: - if old_filename: - os.rename(new_filename, self.filename) - except OSError: - self._report_error('Unable to overwrite current version') - return os.rename(old_filename, self.filename) + except OSError: + return self._report_error('Unable to move current version') - if detect_variant() not in ('win32_exe', 'py2exe'): - if old_filename: - os.remove(old_filename) - else: + try: + os.rename(new_filename, self.filename) + except OSError: + self._report_error('Unable to overwrite current version') + return os.rename(old_filename, self.filename) + + if detect_variant() in ('win32_exe', 'py2exe'): atexit.register(Popen, f'ping 127.0.0.1 -n 5 -w 1000 & del /F "{old_filename}"', shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + elif old_filename: + try: + os.remove(old_filename) + except OSError: + self._report_error('Unable to remove the old version') + + try: + os.chmod(self.filename, 
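The restructured updater logic above is a classic swap-with-rollback: move the running binary aside, move the new one in, and restore the backup if that fails. Stripped of the reporting calls and the Windows-specific deferred delete, it reduces to this sketch (hypothetical function and argument names):

```python
import os

def replace_binary(current, new, backup):
    """Swap `new` into `current`'s place, restoring `backup` on failure."""
    os.rename(current, backup)      # keep the running version around
    try:
        os.rename(new, current)
    except OSError:
        os.rename(backup, current)  # roll back; the install stays usable
        raise
    os.remove(backup)               # success: the old version can go
```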
0o777) + except OSError: + return self._report_error( + f'Unable to set permissions. Run: sudo chmod a+rx {compat_shlex_quote(self.filename)}') self.ydl.to_screen(f'Updated yt-dlp to version {self.new_version}') return True From 0e0ce898f6226f712064a8e809cf3c5690789cce Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 14 Aug 2022 20:34:55 +0530 Subject: [PATCH 033/284] [ThumbnailsConvertor] Fix conversion after fixup_webp Closes #4565 --- yt_dlp/postprocessor/ffmpeg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 6a0a8220ba..a1f367ae42 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -1105,6 +1105,7 @@ def run(self, info): continue has_thumbnail = True self.fixup_webp(info, idx) + original_thumbnail = thumbnail_dict['filepath'] # Path can change during fixup thumbnail_ext = os.path.splitext(original_thumbnail)[1][1:].lower() if thumbnail_ext == 'jpeg': thumbnail_ext = 'jpg' From 66c4afd82892a12cfd9174750b6e12dfaa1d0fcb Mon Sep 17 00:00:00 2001 From: Aldo Ridhoni <aldoridhoni@gmail.com> Date: Mon, 15 Aug 2022 03:43:03 +0800 Subject: [PATCH 034/284] [extractor/doodstream] Add `wf` domain (#4648) Authored by: aldoridhoni --- yt_dlp/extractor/doodstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/doodstream.py b/yt_dlp/extractor/doodstream.py index 0b4e5ccbd5..b41da32e51 100644 --- a/yt_dlp/extractor/doodstream.py +++ b/yt_dlp/extractor/doodstream.py @@ -6,7 +6,7 @@ class DoodStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch|so|pm)/[ed]/(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch|so|pm|wf)/[ed]/(?P<id>[a-z0-9]+)' _TESTS = [{ 'url': 'http://dood.to/e/5s1wmbdacezb', 'md5': '4568b83b31e13242b3f1ff96c55f0595', From 7e823974414dba7a8ae4d703c511f92a374a0a50 Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Sun, 14 Aug 2022 21:47:55 +0200 Subject: [PATCH 035/284] [extractor/rai] Misc fixes (#4600) Authored by: nixxo --- yt_dlp/extractor/rai.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index a73fe37376..dc911069dc 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -51,6 +51,9 @@ def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) + if xpath_text(relinker, './license_url', default='{}') != '{}': + self.report_drm(video_id) + if not geoprotection: geoprotection = xpath_text( relinker, './geoprotection', default=None) == 'Y' @@ -251,6 +254,8 @@ class RaiPlayIE(RaiBaseIE): }, 'release_year': 2022, 'episode': 'Espresso nel caffè - 07/04/2014', + 'timestamp': 1396919880, + 'upload_date': '20140408', }, 'params': { 'skip_download': True, @@ -274,6 +279,8 @@ class RaiPlayIE(RaiBaseIE): 'release_year': 2021, 'season_number': 1, 'episode': 'Senza occhi', + 'timestamp': 1637318940, + 'upload_date': '20211119', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', @@ -284,7 +291,7 @@ class RaiPlayIE(RaiBaseIE): 'only_matching': True, }, { # DRM protected - 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'url': 
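The new DRM check in the Rai relinker handler above keys off the `<license_url>` element, whose placeholder value is literally `{}` for clear streams. The same probe in isolation (the sample XML and the `drm.example` URL are invented):

```python
import xml.etree.ElementTree as ET

def is_drm_protected(relinker_xml):
    return ET.fromstring(relinker_xml).findtext('./license_url', default='{}') != '{}'

assert is_drm_protected(
    '<relinker><license_url>https://drm.example/license</license_url></relinker>')
assert not is_drm_protected('<relinker><license_url>{}</license_url></relinker>')
```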
'https://www.raiplay.it/video/2021/06/Lo-straordinario-mondo-di-Zoey-S2E1-Lo-straordinario-ritorno-di-Zoey-3ba992de-2332-41ad-9214-73e32ab209f4.html', 'only_matching': True, }] @@ -363,6 +370,8 @@ class RaiPlayLiveIE(RaiPlayIE): 'creator': 'Rai News 24', 'is_live': True, 'live_status': 'is_live', + 'upload_date': '20090502', + 'timestamp': 1241276220, }, 'params': { 'skip_download': True, @@ -448,6 +457,8 @@ class RaiPlaySoundIE(RaiBaseIE): 'series': 'Il Ruggito del Coniglio', 'episode': 'Il Ruggito del Coniglio del 10/12/2021', 'creator': 'rai radio 2', + 'timestamp': 1638346620, + 'upload_date': '20211201', }, 'params': { 'skip_download': True, @@ -707,7 +718,8 @@ def _real_extract(self, url): class RaiNewsIE(RaiIE): - _VALID_URL = rf'https?://(www\.)?rainews\.it/[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] _TESTS = [{ # new rainews player (#3911) 'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', @@ -732,6 +744,10 @@ class RaiNewsIE(RaiIE): 'upload_date': '20161103' }, 'expected_warnings': ['unable to extract player_data'], + }, { + # iframe + drm + 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -755,6 +771,7 @@ def _real_extract(self, url): raise ExtractorError('Relinker URL not found', cause=e) relinker_info = self._extract_relinker_info(urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) return { @@ -769,13 +786,13 @@ def _real_extract(self, url): class RaiSudtirolIE(RaiBaseIE): _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)' _TESTS = [{ - 'url': 'https://raisudtirol.rai.it/de/index.php?media=Ttv1656281400', + 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', 'info_dict': { - 'id': 'Ttv1656281400', + 'id': 'Ptv1619729460', 'ext': 'mp4', - 'title': 'Tagesschau + Sport am Sonntag - 31-07-2022 20:00', - 'series': 'Tagesschau + Sport am Sonntag', - 'upload_date': '20220731', + 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', + 'series': 'Euro: trasmisciun d\'economia', + 'upload_date': '20210429', 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg', 'uploader': 'raisudtirol', } @@ -796,6 +813,14 @@ def _real_extract(self, url): 'series': video_title, 'upload_date': unified_strdate(video_date), 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb), - 'url': self._proto_relative_url(video_url), 'uploader': 'raisudtirol', + 'formats': [{ + 'format_id': 'https-mp4', + 'url': self._proto_relative_url(video_url), + 'width': 1024, + 'height': 576, + 'fps': 25, + 'vcodec': 'h264', + 'acodec': 'aac', + }], } From 43cf982ac353c6e257c4d8fadb02c20491a007fb Mon Sep 17 00:00:00 2001 From: Ben Welsh <b@palewi.re> Date: Sun, 14 Aug 2022 13:01:16 -0700 Subject: [PATCH 036/284] [extractor/parler] Add extractor (#4616) Authored by: palewire --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/parler.py | 114 ++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 yt_dlp/extractor/parler.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 73795ddc5f..0503f4c0c9 100644 --- 
a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1236,6 +1236,7 @@ ParamountPlusIE, ParamountPlusSeriesIE, ) +from .parler import ParlerIE from .parlview import ParlviewIE from .patreon import ( PatreonIE, diff --git a/yt_dlp/extractor/parler.py b/yt_dlp/extractor/parler.py new file mode 100644 index 0000000000..5d60134e0c --- /dev/null +++ b/yt_dlp/extractor/parler.py @@ -0,0 +1,114 @@ +import json + +from .common import InfoExtractor +from .youtube import YoutubeIE + +from ..utils import ( + clean_html, + format_field, + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, + urlencode_postdata, +) + + +class ParlerIE(InfoExtractor): + IE_DESC = 'Posts on parler.com' + _VALID_URL = r'https://parler\.com/feed/(?P<id>[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' + _TESTS = [ + { + 'url': 'https://parler.com/feed/df79fdba-07cc-48fe-b085-3293897520d7', + 'md5': '16e0f447bf186bb3cf64de5bbbf4d22d', + 'info_dict': { + 'id': 'df79fdba-07cc-48fe-b085-3293897520d7', + 'ext': 'mp4', + 'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg', + 'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7', + 'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197', + 'timestamp': 1659744000, + 'upload_date': '20220806', + 'uploader': 'Tulsi Gabbard', + 'uploader_id': 'TulsiGabbard', + 'uploader_url': 'https://parler.com/TulsiGabbard', + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'md5': '11687e2f5bb353682cee338d181422ed', + 'info_dict': { + 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'ext': 'mp4', + 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg', + 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'description': 'This man should run for office', + 'timestamp': 1659657600, + 'upload_date': '20220805', + 'uploader': 'Benny Johnson', + 'uploader_id': 'BennyJohnson', + 'uploader_url': 'https://parler.com/BennyJohnson', + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + 'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5', + 'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4', + 'info_dict': { + 'id': 'r5vkSaz8PxQ', + 'ext': 'mp4', + 'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp', + 'title': 'Tom MacDonald Names Reaction', + 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', + 'upload_date': '20220716', + 'duration': 1267, + 'uploader': 'Mahesh Chookolingo', + 'uploader_id': 'maheshchookolingo', + 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo', + 'channel': 'Mahesh Chookolingo', + 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', + 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', + 'categories': ['Entertainment'], + 'tags': list, + 'availability': 'public', + 'live_status': 'not_live', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'age_limit': 0, + 'playable_in_embed': True, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id, + data=urlencode_postdata({'uuid': video_id}))['data'][0] + primary = data['primary'] + + embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False) + if embed: + return self.url_result(embed[0], YoutubeIE) + + 
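`urlencode_postdata`, which makes `_download_json` issue a POST in the new Parler extractor above, is essentially a one-line form encoder; a local equivalent:

```python
import urllib.parse

def urlencode_postdata(fields):
    return urllib.parse.urlencode(fields).encode('ascii')

assert urlencode_postdata({'uuid': 'df79fdba'}) == b'uuid=df79fdba'
```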
return { + 'id': video_id, + 'url': traverse_obj(primary, ('video_data', 'videoSrc')), + 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')), + 'title': '', + 'description': strip_or_none(clean_html(primary.get('full_body'))) or None, + 'timestamp': unified_timestamp(primary.get('date_created')), + 'uploader': strip_or_none(primary.get('name')), + 'uploader_id': strip_or_none(primary.get('username')), + 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'), + 'view_count': int_or_none(primary.get('view_count')), + 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))), + 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))), + } From 63be30e3e06a11d1243032ef7f444e4e276470d4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 14 Aug 2022 20:03:24 +0000 Subject: [PATCH 037/284] [extractor/facebook] Add reel support (#4660) Closes #4039 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/facebook.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0503f4c0c9..34f43cc1e7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -500,6 +500,7 @@ FacebookIE, FacebookPluginsVideoIE, FacebookRedirectURLIE, + FacebookReelIE, ) from .fancode import ( FancodeVodIE, diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index d434b359ae..35acbc6430 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -772,3 +772,30 @@ def _real_extract(self, url): if not redirect_url: raise ExtractorError('Invalid facebook redirect URL', expected=True) return self.url_result(redirect_url) + + +class FacebookReelIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)' + IE_NAME = 'facebook:reel' + + _TESTS = [{ + 'url': 'https://www.facebook.com/reel/1195289147628387', + 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'info_dict': { + 'id': '1195289147628387', + 'ext': 'mp4', + 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', + 'description': 'md5:24ea7ef062215d295bdde64e778f5474', + 'uploader': 'Beast Camp Training', + 'uploader_id': '1738535909799870', + 'duration': 9.536, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20211121', + 'timestamp': 1637502604, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) From cb7cc448c0b7508215a45af0b81506403f61ef05 Mon Sep 17 00:00:00 2001 From: Ben Welsh <b@palewi.re> Date: Sun, 14 Aug 2022 13:06:04 -0700 Subject: [PATCH 038/284] [extractor/truth] Add extractor (#4609) Closes #3865 Authored by: palewire --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/truth.py | 69 +++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 yt_dlp/extractor/truth.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 34f43cc1e7..eb61ad3869 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1794,6 +1794,7 @@ ) from .trueid import TrueIDIE from .trunews import TruNewsIE +from .truth import TruthIE from .trutv import TruTVIE from .tube8 import Tube8IE from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE diff --git a/yt_dlp/extractor/truth.py b/yt_dlp/extractor/truth.py new file 
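Both of the new extractors above lean on `traverse_obj` to pull optional nested fields. The sketch below captures only its core behaviour (the real helper accepts many more path types); the sample data is invented:

```python
def traverse(obj, *path):
    """Simplified stand-in for yt_dlp.utils.traverse_obj: walk nested
    containers and return None instead of raising on a dead end."""
    for key in path:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return None
    return obj

data = {'primary': {'video_data': {'videoSrc': 'https://example.com/v.mp4'}}}
assert traverse(data, 'primary', 'video_data', 'videoSrc') == 'https://example.com/v.mp4'
assert traverse(data, 'primary', 'missing', 'videoSrc') is None
```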
mode 100644 index 0000000000..1c6409ce24 --- /dev/null +++ b/yt_dlp/extractor/truth.py @@ -0,0 +1,69 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + format_field, + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, +) + + +class TruthIE(InfoExtractor): + _VALID_URL = r'https?://truthsocial\.com/@[^/]+/posts/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://truthsocial.com/@realDonaldTrump/posts/108779000807761862', + 'md5': '4a5fb1470c192e493d9efd6f19e514d3', + 'info_dict': { + 'id': '108779000807761862', + 'ext': 'qt', + 'title': 'Truth video #108779000807761862', + 'description': None, + 'timestamp': 1659835827, + 'upload_date': '20220807', + 'uploader': 'Donald J. Trump', + 'uploader_id': 'realDonaldTrump', + 'uploader_url': 'https://truthsocial.com/@realDonaldTrump', + 'repost_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 'https://truthsocial.com/@ProjectVeritasAction/posts/108618228543962049', + 'md5': 'fd47ba68933f9dce27accc52275be9c3', + 'info_dict': { + 'id': '108618228543962049', + 'ext': 'mp4', + 'title': 'md5:debde7186cf83f60ff7b44dbb9444e35', + 'description': 'md5:de2fc49045bf92bb8dc97e56503b150f', + 'timestamp': 1657382637, + 'upload_date': '20220709', + 'uploader': 'Project Veritas Action', + 'uploader_id': 'ProjectVeritasAction', + 'uploader_url': 'https://truthsocial.com/@ProjectVeritasAction', + 'repost_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + status = self._download_json(f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id) + uploader_id = strip_or_none(traverse_obj(status, ('account', 'username'))) + return { + 'id': video_id, + 'url': status['media_attachments'][0]['url'], + 'title': '', + 'description': strip_or_none(clean_html(status.get('content'))) or None, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader': strip_or_none(traverse_obj(status, ('account', 'display_name'))), + 'uploader_id': uploader_id, + 'uploader_url': format_field(uploader_id, None, 'https://truthsocial.com/@%s'), + 'repost_count': int_or_none(status.get('reblogs_count')), + 'like_count': int_or_none(status.get('favourites_count')), + 'comment_count': int_or_none(status.get('replies_count')), + } From 7695f5a0a758477608c68492fc00144cdad1c3bc Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Mon, 15 Aug 2022 05:09:05 +0900 Subject: [PATCH 039/284] [extractor/moview] Add extractor (#4607) Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jixie.py | 51 +++++++++++++++++++++++++++++++++ yt_dlp/extractor/kompas.py | 48 +++---------------------------- yt_dlp/extractor/moview.py | 43 +++++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 44 deletions(-) create mode 100644 yt_dlp/extractor/jixie.py create mode 100644 yt_dlp/extractor/moview.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index eb61ad3869..2195472b72 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -975,6 +975,7 @@ from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviepilot import MoviepilotIE +from .moview import MoviewPlayIE from .moviezine import MoviezineIE from .movingimage import MovingImageIE from .msn import MSNIE diff --git a/yt_dlp/extractor/jixie.py b/yt_dlp/extractor/jixie.py new file mode 100644 index 0000000000..3bb685e016 --- 
/dev/null +++ b/yt_dlp/extractor/jixie.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + traverse_obj, + try_call, +) + +# more info about jixie: +# [1] https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, +# [2] https://scripts.jixie.media/jxvideo.3.1.min.js + + +class JixieBaseIE(InfoExtractor): + def _extract_data_from_jixie_id(self, display_id, video_id, webpage): + json_data = self._download_json( + 'https://apidam.jixie.io/api/public/stream', display_id, + query={'metadata': 'full', 'video_id': video_id})['data'] + + formats, subtitles = [], {} + for stream in json_data['streams']: + if stream.get('type') == 'HLS': + fmt, sub = self._extract_m3u8_formats_and_subtitles(stream.get('url'), display_id, ext='mp4') + if json_data.get('drm'): + for f in fmt: + f['has_drm'] = True + formats.extend(fmt) + self._merge_subtitles(sub, target=subtitles) + else: + formats.append({ + 'url': stream.get('url'), + 'width': stream.get('width'), + 'height': stream.get('height'), + 'ext': 'mp4', + }) + + self._sort_formats(formats) + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': json_data.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': (clean_html(traverse_obj(json_data, ('metadata', 'description'))) + or self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage)), + 'thumbnails': traverse_obj(json_data, ('metadata', 'thumbnails')), + 'duration': float_or_none(traverse_obj(json_data, ('metadata', 'duration'))), + 'tags': try_call(lambda: (json_data['metadata']['keywords'] or None).split(',')), + 'categories': try_call(lambda: (json_data['metadata']['categories'] or None).split(',')), + 'uploader_id': json_data.get('owner_id'), + } diff --git a/yt_dlp/extractor/kompas.py b/yt_dlp/extractor/kompas.py index d400c42f37..03f5f30bd7 100644 --- a/yt_dlp/extractor/kompas.py +++ b/yt_dlp/extractor/kompas.py @@ -1,17 +1,9 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, - traverse_obj, - try_call, -) +from .jixie import JixieBaseIE -# Video from www.kompas.tv and video.kompas.com seems use jixie player -# see [1] https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, -# [2] https://scripts.jixie.media/jxvideo.3.1.min.js for more info +# Video from video.kompas.com seems use jixie player -class KompasVideoIE(InfoExtractor): +class KompasVideoIE(JixieBaseIE): _VALID_URL = r'https?://video\.kompas\.com/\w+/(?P<id>\d+)/(?P<slug>[\w-]+)' _TESTS = [{ 'url': 'https://video.kompas.com/watch/164474/kim-jong-un-siap-kirim-nuklir-lawan-as-dan-korsel', @@ -33,36 +25,4 @@ def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).group('id', 'slug') webpage = self._download_webpage(url, display_id) - json_data = self._download_json( - 'https://apidam.jixie.io/api/public/stream', display_id, - query={'metadata': 'full', 'video_id': video_id})['data'] - - formats, subtitles = [], {} - for stream in json_data['streams']: - if stream.get('type') == 'HLS': - fmt, sub = self._extract_m3u8_formats_and_subtitles(stream.get('url'), display_id, ext='mp4') - formats.extend(fmt) - self._merge_subtitles(sub, target=subtitles) - else: - formats.append({ - 'url': stream.get('url'), - 'width': stream.get('width'), - 'height': stream.get('height'), - 'ext': 'mp4', - }) - - 
self._sort_formats(formats) - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'subtitles': subtitles, - 'title': json_data.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'description': (clean_html(traverse_obj(json_data, ('metadata', 'description'))) - or self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage)), - 'thumbnails': traverse_obj(json_data, ('metadata', 'thumbnails')), - 'duration': float_or_none(traverse_obj(json_data, ('metadata', 'duration'))), - 'tags': try_call(lambda: json_data['metadata']['keywords'].split(',')), - 'categories': try_call(lambda: json_data['metadata']['categories'].split(',')), - 'uploader_id': json_data.get('owner_id'), - } + return self._extract_data_from_jixie_id(display_id, video_id, webpage) diff --git a/yt_dlp/extractor/moview.py b/yt_dlp/extractor/moview.py new file mode 100644 index 0000000000..678b2eb06e --- /dev/null +++ b/yt_dlp/extractor/moview.py @@ -0,0 +1,43 @@ +from .jixie import JixieBaseIE + + +class MoviewPlayIE(JixieBaseIE): + _VALID_URL = r'https?://www\.moview\.id/play/\d+/(?P<id>[\w-]+)' + _TESTS = [ + { + # drm hls, only use direct link + 'url': 'https://www.moview.id/play/174/Candy-Monster', + 'info_dict': { + 'id': '146182', + 'ext': 'mp4', + 'display_id': 'Candy-Monster', + 'uploader_id': 'Mo165qXUUf', + 'duration': 528.2, + 'title': 'Candy Monster', + 'description': 'Mengapa Candy Monster ingin mengambil permen Chloe?', + 'thumbnail': 'https://video.jixie.media/1034/146182/146182_1280x720.jpg', + } + }, { + # non-drm hls + 'url': 'https://www.moview.id/play/75/Paris-Van-Java-Episode-16', + 'info_dict': { + 'id': '28210', + 'ext': 'mp4', + 'duration': 2595.666667, + 'display_id': 'Paris-Van-Java-Episode-16', + 'uploader_id': 'Mo165qXUUf', + 'thumbnail': 'https://video.jixie.media/1003/28210/28210_1280x720.jpg', + 'description': 'md5:2a5e18d98eef9b39d7895029cac96c63', + 'title': 'Paris Van Java Episode 16', + } + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'video_id\s*=\s*"(?P<video_id>[^"]+)', webpage, 'video_id') + + return self._extract_data_from_jixie_id(display_id, video_id, webpage) From e183bb8c9b12a3d600b570dc1a0ec064df3a24f2 Mon Sep 17 00:00:00 2001 From: ischmidt20 <ischmidt20@berkeley.edu> Date: Sun, 14 Aug 2022 16:17:18 -0400 Subject: [PATCH 040/284] [extractor/MLB] New extractor (#4586) Authored by: ischmidt20 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mlb.py | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2195472b72..d70302548e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -957,6 +957,7 @@ from .mlb import ( MLBIE, MLBVideoIE, + MLBTVIE, ) from .mlssoccer import MLSSoccerIE from .mnet import MnetIE diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index dd1f54f871..48baecc47a 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -1,11 +1,15 @@ import re +import urllib.parse +import uuid from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, + traverse_obj, try_get, ) @@ -267,3 +271,79 @@ def _download_video_data(self, display_id): } }''' % display_id, })['data']['mediaPlayback'][0] + + +class MLBTVIE(InfoExtractor): + _VALID_URL = 
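The new Moview extractor above scrapes the embedded id with `_search_regex`; the same pattern applied to an invented page snippet:

```python
import re

webpage = '<script>var video_id = "146182";</script>'  # made-up sample markup
video_id = re.search(r'video_id\s*=\s*"(?P<video_id>[^"]+)', webpage)['video_id']
assert video_id == '146182'
```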
r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})' + _NETRC_MACHINE = 'mlb' + + _TESTS = [{ + 'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638', + 'info_dict': { + 'id': '661581', + 'ext': 'mp4', + 'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies', + }, + 'params': { + 'skip_download': True, + }, + }] + _access_token = None + + def _real_initialize(self): + if not self._access_token: + self.raise_login_required( + 'All videos are only available to registered users', method='password') + + def _perform_login(self, username, password): + data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356' + access_token = self._download_json( + 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, + headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=data.encode())['access_token'] + + entitlement = self._download_webpage( + f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={str(uuid.uuid4())}', None, + headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Authorization': f'Bearer {access_token}' + }) + + data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv' + self._access_token = self._download_json( + 'https://us.edge.bamgrid.com/token', None, + headers={ + 'Accept': 'application/json', + 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk', + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=data.encode())['access_token'] + + def _real_extract(self, url): + video_id = self._match_id(url) + airings = self._download_json( + f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', + video_id)['data']['Airings'] + + formats, subtitles = [], {} + for airing in airings: + m3u8_url = self._download_json( + airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id, + headers={ + 'Authorization': self._access_token, + 'Accept': 'application/vnd.media-service+json; version=2' + })['stream']['complete'] + f, s = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id=join_nonempty(airing.get('feedType'), airing.get('feedLanguage'))) + formats.extend(f) + self._merge_subtitles(s, target=subtitles) + + self._sort_formats(formats) + return { + 'id': video_id, + 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': {'Authorization': f'Bearer {self._access_token}'}, + } From ef6342bd07c7bd1e41b0cc8889bcfadfab3477f2 Mon Sep 17 00:00:00 2001 From: masta79 <ne-github@erfurth.eu> Date: Mon, 15 Aug 2022 00:01:41 +0200 Subject: [PATCH 041/284] [extractor/toggo] Improve `_VALID_URL` (#4663) Authored by: masta79 --- yt_dlp/extractor/toggo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/toggo.py b/yt_dlp/extractor/toggo.py index 9f98cfaf0c..1ddec493d8 100644 --- a/yt_dlp/extractor/toggo.py +++ b/yt_dlp/extractor/toggo.py @@ -4,7 +4,7 @@ class ToggoIE(InfoExtractor): IE_NAME = 'toggo' - _VALID_URL = r'https?://(?:www\.)?toggo\.de/(?:toggolino/)?[^/?#]+/folge/(?P<id>[^/?#]+)' + _VALID_URL = 
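`join_nonempty`, used above to build the `m3u8_id` from the airing's feed type and language, skips falsy parts so a missing attribute leaves no dangling separator. Its core behaviour (feed values here are made up):

```python
def join_nonempty(*values, delim='-'):
    return delim.join(str(v) for v in values if v)

assert join_nonempty('HOME', 'en') == 'HOME-en'
assert join_nonempty('HOME', None) == 'HOME'  # missing language, no stray dash
```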
r'https?://(?:www\.)?toggo\.de/(?:toggolino/)?[^/?#]+/(?:folge|video)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.toggo.de/weihnachtsmann--co-kg/folge/ein-geschenk-fuer-zwei', 'info_dict': { @@ -33,6 +33,9 @@ class ToggoIE(InfoExtractor): }, { 'url': 'https://www.toggo.de/toggolino/paw-patrol/folge/der-wetter-zeppelin-der-chili-kochwettbewerb', 'only_matching': True, + }, { + 'url': 'https://www.toggo.de/toggolino/paw-patrol/video/paw-patrol-rettung-im-anflug', + 'only_matching': True, }] def _real_extract(self, url): From 6440c45ff3c3209593c0f39af075e71e4ca0299a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 14 Aug 2022 22:51:38 +0530 Subject: [PATCH 042/284] [update] Copy bitmask from old binary Improves a6125983ab4434fc4079f575a4bf22042411ea5e Authored by: Lesmiscore --- yt_dlp/update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index a5cd11150c..fc96f29850 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -230,6 +230,7 @@ def update(self): return self._report_permission_error(new_filename) if old_filename: + mask = os.stat(self.filename).st_mode try: os.rename(self.filename, old_filename) except OSError: @@ -251,7 +252,7 @@ def update(self): self._report_error('Unable to remove the old version') try: - os.chmod(self.filename, 0o777) + os.chmod(self.filename, mask) except OSError: return self._report_error( f'Unable to set permissions. Run: sudo chmod a+rx {compat_shlex_quote(self.filename)}') From 48732becfe013849a4191ff467f27b08e04e84fb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 01:53:42 +0530 Subject: [PATCH 043/284] Fix bug in 1155ecef29187bff975ceb51c755722c660e0387 --- yt_dlp/extractor/zattoo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 975cc71259..9ce15b3889 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -238,8 +238,8 @@ def _extract_ondemand(self, ondemand_id): return info_dict def _real_extract(self, url): - vid1, vid2 = self._match_valid_url(url).group('vid1', 'vid2') - return getattr(self, f'_extract_{self._TYPE}')(vid1 or vid2) + video_id, record_id = self._match_valid_url(url).groups() + return self._extract_video(video_id, record_id) def _make_valid_url(host): @@ -258,6 +258,10 @@ def _create_valid_url(match, qs, base_re=None): {match_base} )''' + def _real_extract(self, url): + vid1, vid2 = self._match_valid_url(url).group('vid1', 'vid2') + return getattr(self, f'_extract_{self._TYPE}')(vid1 or vid2) + class ZattooIE(ZattooBaseIE): _VALID_URL = ZattooBaseIE._create_valid_url(r'\d+', 'program', '(?:program|watch)/[^/]+') From d711839760e220e561098cf257de43769049d238 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 03:22:57 +0530 Subject: [PATCH 044/284] Update to ytdl-commit-e6a836d [core] Make `--max-downloads ...` stop immediately on reaching the limit https://github.com/ytdl-org/youtube-dl/commit/e6a836d54ca1d3cd02f3ee45ef707a46f23e8291 --- test/test_download.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 787013c342..ee53efa1c4 100755 --- a/test/test_download.py +++ b/test/test_download.py @@ -105,11 +105,11 @@ def print_skipping(reason): info_dict = tc.get('info_dict', {}) params = tc.get('params', {}) if not info_dict.get('id'): - raise Exception('Test definition 
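The follow-up fix above copies the old binary's permission bits instead of forcing `0o777`. A POSIX-only demonstration of the capture/re-apply step, with a temporary file standing in for the binary:

```python
import os
import stat
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as f:
    path = f.name
os.chmod(path, 0o755)
mask = os.stat(path).st_mode        # captured before the old binary is moved
os.chmod(path, stat.S_IMODE(mask))  # re-applied to the downloaded replacement
assert stat.S_IMODE(os.stat(path).st_mode) == 0o755
os.remove(path)
```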
incorrect. \'id\' key is not present') + raise Exception(f'Test {tname} definition incorrect - "id" key is not present') elif not info_dict.get('ext'): if params.get('skip_download') and params.get('ignore_no_formats_error'): continue - raise Exception('Test definition incorrect. The output file cannot be known. \'ext\' key is not present') + raise Exception(f'Test {tname} definition incorrect - "ext" key must be present to define the output file') if 'skip' in test_case: print_skipping(test_case['skip']) @@ -161,7 +161,9 @@ def try_rm_tcs_files(tcs=None): force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if not err.exc_info[0] in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503): + if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) + or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)): + err.msg = f'{getattr(err, "msg", err)} ({tname})' raise if try_num == RETRIES: From 49b4ceaedf92db85177cfa10542bddbed16529c7 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 03:20:36 +0530 Subject: [PATCH 045/284] [jsinterp] Bring or-par with youtube-dl Partially cherry-picked from: https://github.com/ytdl-org/youtube-dl/commit/d231b56717c73ee597d2e077d11b69ed48a1b02d Authored by pukkandan, dirkf --- README.md | 2 +- test/test_jsinterp.py | 30 +++++++++++++++++++ test/test_youtube_signature.py | 1 + yt_dlp/jsinterp.py | 54 ++++++++++++++++++++++------------ 4 files changed, 67 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 9672a17718..42cbfcebac 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/adb5294](https://github.com/ytdl-org/youtube-dl/commit/adb5294177265ba35b45746dbb600965076ed150)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/d231b56](https://github.com/ytdl-org/youtube-dl/commit/d231b56717c73ee597d2e077d11b69ed48a1b02d)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 48e2abcf66..c97f6dcfb9 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -48,6 +48,9 @@ def test_operators(self): jsi = JSInterpreter('function f(){return 1 << 5;}') self.assertEqual(jsi.call_function('f'), 32) + jsi = JSInterpreter('function f(){return 2 ** 5}') + self.assertEqual(jsi.call_function('f'), 32) 
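The new tests above pin down operators that the interpreter resolves through Python's `operator` module, as the accompanying `jsinterp` hunk shows; a three-entry excerpt of that lookup:

```python
import operator

_OPERATORS = {'**': operator.pow, '<<': operator.lshift, '==': operator.eq}

assert _OPERATORS['**'](2, 5) == 32
assert _OPERATORS['<<'](1, 5) == 32
assert _OPERATORS['=='](1, 2) is False
```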
+ jsi = JSInterpreter('function f(){return 19 & 21;}') self.assertEqual(jsi.call_function('f'), 17) @@ -57,6 +60,12 @@ def test_operators(self): jsi = JSInterpreter('function f(){return []? 2+3: 4;}') self.assertEqual(jsi.call_function('f'), 5) + jsi = JSInterpreter('function f(){return 1 == 2}') + self.assertEqual(jsi.call_function('f'), False) + + jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') + self.assertEqual(jsi.call_function('f'), 2) + def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) @@ -114,6 +123,16 @@ def test_precedence(self): }''') self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + def test_builtins(self): + jsi = JSInterpreter(''' + function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + jsi = JSInterpreter(''' + function x(dt) { return new Date(dt) - 0; } + ''') + self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } @@ -188,6 +207,17 @@ def test_comma(self): ''') self.assertEqual(jsi.call_function('x'), 7) + jsi = JSInterpreter(''' + function x() { a=5; return (a -= 1, a+=3, a); } + ''') + self.assertEqual(jsi.call_function('x'), 7) + + def test_void(self): + jsi = JSInterpreter(''' + function x() { return void 42; } + ''') + self.assertEqual(jsi.call_function('x'), None) + def test_return_function(self): jsi = JSInterpreter(''' function x() { return [1, function(){return 1}][1] } diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 559bdfccff..79bbfc3237 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -109,6 +109,7 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( + ('https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', '4c3f79c5'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 1af6ee0aa2..87f141476c 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -17,6 +17,8 @@ ) _NAME_RE = r'[a-zA-Z_$][\w$]*' + +# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence _OPERATORS = { # None => Defined in JSInterpreter._operator '?': None, @@ -26,23 +28,31 @@ '|': operator.or_, '^': operator.xor, - # FIXME: This should actually be below comparision - '>>': operator.rshift, - '<<': operator.lshift, + '===': operator.is_, + '!==': operator.is_not, + '==': operator.eq, + '!=': operator.ne, '<=': operator.le, '>=': operator.ge, '<': operator.lt, '>': operator.gt, + '>>': operator.rshift, + '<<': operator.lshift, + '+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv, '%': operator.mod, + + '**': operator.pow, } +_COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} + _MATCHING_PARENS = dict(zip('({[', ')}]')) _QUOTES = '\'"' @@ -81,7 +91,7 @@ def __delitem__(self, key): class Debugger: import sys - ENABLED = 'pytest' in sys.modules + ENABLED = False and 'pytest' in sys.modules @staticmethod def write(*args, level=100): @@ 
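Why the `test_builtins` expectation above is 86000: 18:01:26 MDT (UTC-6) on 1969-12-31 is 00:01:26 UTC on 1970-01-01, i.e. 86 seconds past the Unix epoch, and JS `Date` arithmetic yields milliseconds. A quick cross-check in plain Python:

```python
from datetime import datetime, timedelta, timezone

mdt = timezone(timedelta(hours=-6))
dt = datetime(1969, 12, 31, 18, 1, 26, tzinfo=mdt)
assert int(dt.timestamp() * 1000) == 86000
```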
-200,7 +210,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if should_return: return ret, should_return - m = re.match(r'(?P<var>var\s)|return(?:\s+|$)', stmt) + m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|$)', stmt) if m: expr = stmt[len(m.group(0)):].strip() should_return = not m.group('var') @@ -218,13 +228,18 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): obj = expr[4:] if obj.startswith('Date('): left, right = self._separate_at_paren(obj[4:], ')') - expr = unified_timestamp(left[1:-1], False) + expr = unified_timestamp( + self.interpret_expression(left, local_vars, allow_recursion), False) if not expr: raise self.Exception(f'Failed to parse date {left!r}', expr) expr = self._dump(int(expr * 1000), local_vars) + right else: raise self.Exception(f'Unsupported object {obj}', expr) + if expr.startswith('void '): + left = self.interpret_expression(expr[5:], local_vars, allow_recursion) + return None, should_return + if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) @@ -307,7 +322,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if default: matched = matched or case == 'default' elif not matched: - matched = case != 'default' and switch_val == self.interpret_expression(case, local_vars, allow_recursion) + matched = (case != 'default' + and switch_val == self.interpret_expression(case, local_vars, allow_recursion)) if not matched: continue try: @@ -347,7 +363,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): m = re.match(fr'''(?x) (?P<assign> (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* - (?P<op>{"|".join(map(re.escape, _OPERATORS))})? + (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? 
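The assignment regex above now excludes the comparison operators from the compound-assignment slot, presumably so that something like `a<=b` cannot be mis-read as an assignment `a = a < b`. A toy check of that exclusion (the operator lists are abbreviated):

```python
import re

_OPERATORS = ['<=', '>=', '<', '>', '==', '+', '-', '*']
_COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'}

op_re = '|'.join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))

assert re.match(rf'\w+(?:{op_re})=', 'a+=1')      # compound assignment
assert not re.match(rf'\w+(?:{op_re})=', 'a<=1')  # comparison, left alone
```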
=(?P<expr>.*)$ )|(?P<return> (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ @@ -397,12 +413,14 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): for op in _OPERATORS: separated = list(self._separate(expr, op)) - if len(separated) < 2: - continue right_expr = separated.pop() - while op == '-' and len(separated) > 1 and not separated[-1].strip(): - right_expr = f'-{right_expr}' + while op in '<>*-' and len(separated) > 1 and not separated[-1].strip(): separated.pop() + right_expr = f'{op}{right_expr}' + if op != '-': + right_expr = f'{separated.pop()}{op}{right_expr}' + if not separated: + continue left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) return self._operator(op, 0 if left_val is None else left_val, right_expr, expr, local_vars, allow_recursion), should_return @@ -564,8 +582,8 @@ def extract_object(self, objname): # Currently, it only supports function definitions fields_m = re.finditer( r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} - ''' % _FUNC_NAME_RE, + (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} + ''' % (_FUNC_NAME_RE, _NAME_RE), fields) for f in fields_m: argnames = f.group('args').split(',') @@ -580,7 +598,7 @@ def extract_function_code(self, funcname): (?: function\s+%(name)s| [{;,]\s*%(name)s\s*=\s*function| - var\s+%(name)s\s*=\s*function + (?:var|const|let)\s+%(name)s\s*=\s*function )\s* \((?P<args>[^)]*)\)\s* (?P<code>{.+})''' % {'name': re.escape(funcname)}, @@ -615,10 +633,8 @@ def build_function(self, argnames, code, *global_stack): argnames = tuple(argnames) def resf(args, kwargs={}, allow_recursion=100): - global_stack[0].update({ - **dict(itertools.zip_longest(argnames, args, fillvalue=None)), - **kwargs - }) + global_stack[0].update(itertools.zip_longest(argnames, args, fillvalue=None)) + global_stack[0].update(kwargs) var_stack = LocalNameSpace(*global_stack) ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, allow_recursion - 1) if should_abort: From 1e4fca9a87b0ff6b7316261a2f081493af3885b2 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 03:15:05 +0530 Subject: [PATCH 046/284] [cleanup] Misc --- Changelog.md | 10 +++++----- Collaborators.md | 9 +++++---- README.md | 6 ++---- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/jixie.py | 17 +++++++---------- yt_dlp/extractor/kompas.py | 2 -- yt_dlp/extractor/mlb.py | 2 +- yt_dlp/extractor/parler.py | 3 --- yt_dlp/extractor/twitch.py | 2 +- yt_dlp/extractor/zattoo.py | 5 +---- 10 files changed, 23 insertions(+), 35 deletions(-) diff --git a/Changelog.md b/Changelog.md index 483c947b60..ad9c00b204 100644 --- a/Changelog.md +++ b/Changelog.md @@ -20,10 +20,10 @@ ### 2022.08.08 * `--compat-option no-live-chat` should disable danmaku * Fix misleading DRM message * Import ctypes only when necessary -* Minor bugfixes by [pukkandan](https://github.com/pukkandan) -* Reject entire playlists faster with `--match-filter` by [pukkandan](https://github.com/pukkandan) +* Minor bugfixes +* Reject entire playlists faster with `--match-filter` * Remove filtered entries from `-J` -* Standardize retry mechanism by [pukkandan](https://github.com/pukkandan) +* Standardize retry mechanism * Validate `--merge-output-format` * [downloader] Add average speed to final progress line * [extractor] Add field `audio_channels` @@ -31,7 +31,7 @@ ### 2022.08.08 * [ffmpeg] Set `ffmpeg_location` in a contextvar * [FFmpegThumbnailsConvertor] Fix 
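The `build_function` change above binds call arguments to parameter names in one step, padding missing ones with `None`; the same `zip_longest` trick in isolation:

```python
import itertools

argnames, args = ('a', 'b', 'c'), (1,)
bound = dict(itertools.zip_longest(argnames, args, fillvalue=None))
assert bound == {'a': 1, 'b': None, 'c': None}
```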
conversion from GIF * [MetadataParser] Don't set `None` when the field didn't match -* [outtmpl] Smarter replacing of unsupported characters by [pukkandan](https://github.com/pukkandan) +* [outtmpl] Smarter replacing of unsupported characters * [outtmpl] Treat empty values as None in filenames * [utils] sanitize_open: Allow any IO stream as stdout * [build, devscripts] Add devscript to set a build variant @@ -64,7 +64,7 @@ ### 2022.08.08 * [extractor/bbc] Fix news articles by [ajj8](https://github.com/ajj8) * [extractor/camtasia] Separate into own extractor by [coletdjnz](https://github.com/coletdjnz) * [extractor/cloudflarestream] Fix video_id padding by [haobinliang](https://github.com/haobinliang) -* [extractor/crunchyroll] Fix conversion of thumbnail from GIF by [pukkandan](https://github.com/pukkandan) +* [extractor/crunchyroll] Fix conversion of thumbnail from GIF * [extractor/crunchyroll] Handle missing metadata correctly by [Burve](https://github.com/Burve), [pukkandan](https://github.com/pukkandan) * [extractor/crunchyroll:beta] Extract timestamp and fix tests by [tejing1](https://github.com/tejing1) * [extractor/crunchyroll:beta] Use streams API by [tejing1](https://github.com/tejing1) diff --git a/Collaborators.md b/Collaborators.md index 52e3b9caea..3f24d5c476 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -28,12 +28,12 @@ ## [coletdjnz](https://github.com/coletdjnz) [![gh-sponsor](https://img.shields.io/badge/_-Sponsor-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements -* Added support for downloading YoutubeWebArchive videos -* Added support for new websites MainStreaming, PRX, nzherald, etc +* Added support for new websites YoutubeWebArchive, MainStreaming, PRX, nzherald, Mediaklikk, StarTV etc +* Improved/fixed support for Patreon, panopto, gfycat, itv, pbs, SouthParkDE etc -## [Ashish0804](https://github.com/Ashish0804) +## [Ashish0804](https://github.com/Ashish0804) <sub><sup>[Inactive]</sup></sub> [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/ashish0804) @@ -48,4 +48,5 @@ ## [Lesmiscore](https://github.com/Lesmiscore) (nao20010128nao) **Monacoin**: mona1q3tf7dzvshrhfe3md379xtvt2n22duhglv5dskr * Download live from start to end for YouTube -* Added support for new websites mildom, PixivSketch, skeb, radiko, voicy, mirrativ, openrec, whowatch, damtomo, 17.live, mixch etc +* Added support for new websites AbemaTV, mildom, PixivSketch, skeb, radiko, voicy, mirrativ, openrec, whowatch, damtomo, 17.live, mixch etc +* Improved/fixed support for fc2, YahooJapanNews, tver, iwara etc diff --git a/README.md b/README.md index 42cbfcebac..31793b54e0 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ ### Differences in default behavior * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. 
`--compat-options no-keep-subs` can be used to revert this * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` -* youtube-dl tries to remove some superfluous punctuations from filenames. While this can sometimes be helpful, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior +* yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior For ease of use, a few more compat options are available: @@ -1758,9 +1758,7 @@ #### youtube * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `innertube_host`: Innertube API host to use for all API requests - * E.g. `studio.youtube.com`, `youtubei.googleapis.com` - * Note: Cookies exported from `www.youtube.com` will not work with hosts other than `*.youtube.com` +* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests #### youtubetab (YouTube playlists, channels, feeds, etc.) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 498e8dd8e2..7a2b03cb50 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -301,7 +301,7 @@ class YoutubeDL: should act on each input URL as opposed to for the entire queue cookiefile: File name or text stream from where cookies should be read and dumped to cookiesfrombrowser: A tuple containing the name of the browser, the profile - name/pathfrom where cookies are loaded, and the name of the + name/path from where cookies are loaded, and the name of the keyring, e.g. 
('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') legacyserverconnect: Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation diff --git a/yt_dlp/extractor/jixie.py b/yt_dlp/extractor/jixie.py index 3bb685e016..7480af0504 100644 --- a/yt_dlp/extractor/jixie.py +++ b/yt_dlp/extractor/jixie.py @@ -1,17 +1,14 @@ from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, - traverse_obj, - try_call, -) - -# more info about jixie: -# [1] https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, -# [2] https://scripts.jixie.media/jxvideo.3.1.min.js +from ..utils import clean_html, float_or_none, traverse_obj, try_call class JixieBaseIE(InfoExtractor): + """ + API Reference: + https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, + https://scripts.jixie.media/jxvideo.3.1.min.js + """ + def _extract_data_from_jixie_id(self, display_id, video_id, webpage): json_data = self._download_json( 'https://apidam.jixie.io/api/public/stream', display_id, diff --git a/yt_dlp/extractor/kompas.py b/yt_dlp/extractor/kompas.py index 03f5f30bd7..8bad961906 100644 --- a/yt_dlp/extractor/kompas.py +++ b/yt_dlp/extractor/kompas.py @@ -1,7 +1,5 @@ from .jixie import JixieBaseIE -# Video from video.kompas.com seems use jixie player - class KompasVideoIE(JixieBaseIE): _VALID_URL = r'https?://video\.kompas\.com/\w+/(?P<id>\d+)/(?P<slug>[\w-]+)' diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 48baecc47a..ab0edbae39 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -325,7 +325,7 @@ def _real_extract(self, url): airings = self._download_json( f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', video_id)['data']['Airings'] - + formats, subtitles = [], {} for airing in airings: m3u8_url = self._download_json( diff --git a/yt_dlp/extractor/parler.py b/yt_dlp/extractor/parler.py index 5d60134e0c..68a60bc84f 100644 --- a/yt_dlp/extractor/parler.py +++ b/yt_dlp/extractor/parler.py @@ -1,8 +1,5 @@ -import json - from .common import InfoExtractor from .youtube import YoutubeIE - from ..utils import ( clean_html, format_field, diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index a667d6ec2d..975e09c302 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -1169,7 +1169,7 @@ def _real_extract(self, url): 'id': clip.get('id') or video_id, '_old_archive_ids': [make_archive_id(self, old_id)] if old_id else None, 'display_id': video_id, - 'title': clip.get('title') or video_id, + 'title': clip.get('title'), 'formats': formats, 'duration': int_or_none(clip.get('durationSeconds')), 'view_count': int_or_none(clip.get('viewCount')), diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 9ce15b3889..2bd684c7e8 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -2,10 +2,7 @@ from uuid import uuid4 from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError, compat_str from ..utils import ( ExtractorError, int_or_none, From 55937202b72a64f9ca8a877dbb0e1eea401427cc Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 03:43:29 +0530 Subject: [PATCH 047/284] Release 2022.08.14 --- CONTRIBUTORS | 5 +++++ Changelog.md | 31 
+++++++++++++++++++++++++++++++ supportedsites.md | 5 +++++ 3 files changed, 41 insertions(+) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index cf9b0ea544..eaf3450405 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -294,3 +294,8 @@ haobinliang Mehavoid winterbird-code yashkc2025 +aldoridhoni +bashonly +jacobtruman +masta79 +palewire diff --git a/Changelog.md b/Changelog.md index ad9c00b204..7d16b8a8fa 100644 --- a/Changelog.md +++ b/Changelog.md @@ -11,6 +11,37 @@ # Instuctions for creating release --> +### 2022.08.14 + +* Merge youtube-dl: Upto [commit/d231b56](https://github.com/ytdl-org/youtube-dl/commit/d231b56) +* [jsinterp] Handle **new youtube signature functions** +* [jsinterp] Truncate error messages +* [extractor] Fix format sorting of `channels` +* [ffmpeg] Disable avconv unless `--prefer-avconv` +* [ffmpeg] Smarter detection of ffprobe filename +* [patreon] Ignore erroneous media attachments by [coletdjnz](https://github.com/coletdjnz) +* [postprocessor/embedthumbnail] Detect `libatomicparsley.so` +* [ThumbnailsConvertor] Fix conversion after `fixup_webp` +* [utils] Fix `get_compatible_ext` +* [build] Fix changelog +* [update] Set executable bit-mask by [pukkandan](https://github.com/pukkandan), [Lesmiscore](https://github.com/Lesmiscore) +* [devscripts] Fix import +* [docs] Consistent use of `e.g.` by [Lesmiscore](https://github.com/Lesmiscore) +* [cleanup] Misc fixes and cleanup +* [extractor/moview] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/parler] Add extractor by [palewire](https://github.com/palewire) +* [extractor/truth] Add extractor by [palewire](https://github.com/palewire) +* [extractor/aenetworks] Add formats parameter by [jacobtruman](https://github.com/jacobtruman) +* [extractor/crunchyroll] Improve `_VALID_URL`s +* [extractor/doodstream] Add `wf` domain by [aldoridhoni](https://github.com/aldoridhoni) +* [extractor/facebook] Add reel support by [bashonly](https://github.com/bashonly) +* [extractor/MLB] New extractor by [ischmidt20](https://github.com/ischmidt20) +* [extractor/rai] Misc fixes by [nixxo](https://github.com/nixxo) +* [extractor/toggo] Improve `_VALID_URL` by [masta79](https://github.com/masta79) +* [extractor/tubitv] Extract additional formats by [shirt-dev](https://github.com/shirt-dev) +* [extractor/zattoo] Potential fix for resellers + + ### 2022.08.08 * **Remove Python 3.6 support** diff --git a/supportedsites.md b/supportedsites.md index e5f808396a..aa1d52b5b3 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -380,6 +380,7 @@ # Supported sites - **ExtremeTube** - **EyedoTV** - **facebook**: [<abbr title="netrc machine"><em>facebook</em></abbr>] + - **facebook:reel** - **FacebookPluginsVideo** - **fancode:live**: [<abbr title="netrc machine"><em>fancode</em></abbr>] - **fancode:vod**: [<abbr title="netrc machine"><em>fancode</em></abbr>] @@ -709,6 +710,7 @@ # Supported sites - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBTV**: [<abbr title="netrc machine"><em>mlb</em></abbr>] - **MLBVideo** - **MLSSoccer** - **Mnet** @@ -726,6 +728,7 @@ # Supported sites - **MovieClips** - **MovieFap** - **Moviepilot** + - **MoviewPlay** - **Moviezine** - **MovingImage** - **MSN** @@ -916,6 +919,7 @@ # Supported sites - **ParamountNetwork** - **ParamountPlus** - **ParamountPlusSeries** + - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview** - **Patreon** @@ -1314,6 +1318,7 @@ # Supported sites - **TrovoVod** - **TrueID** - **TruNews** + - **Truth** - **TruTV** - 
**Tube8** - **TubeTuGraz**: [<abbr title="netrc machine"><em>tubetugraz</em></abbr>] tube.tugraz.at From 9fd03a16960918187cea826f241620b8c98d34fb Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Sun, 14 Aug 2022 22:18:33 +0000 Subject: [PATCH 048/284] [version] update Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++++---- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++++---- yt_dlp/version.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index cf2ce93f01..5c54d3c5e4 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,7 +62,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -70,8 +70,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 8b94a7e9ef..89d59b6f1b 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -74,7 +74,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, 
error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -82,8 +82,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 4c1e1b9235..b2fb774fee 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -70,7 +70,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -78,8 +78,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 4d9c6c5799..f30c2cb90f 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -55,7 +55,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs 
utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -63,8 +63,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 4ab6df8062..3f955bd0b5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -51,7 +51,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -59,7 +59,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 2cfd49f3da..20e3050331 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.08** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.08 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,7 +65,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.08, Current version: 2022.08.08 - yt-dlp is up to date (2022.08.08) + Latest version: 2022.08.14, Current version: 2022.08.14 + yt-dlp is up to date (2022.08.14) <more lines> render: shell diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 955970a2f8..9786ee978f 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.08.08' +__version__ = '2022.08.14' -RELEASE_GIT_HEAD = '3157158f7' +RELEASE_GIT_HEAD = '55937202b' VARIANT = None From 460eb9c50e0970fdceb51485c5fe3268574c48e8 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Mon, 15 Aug 2022 15:43:43 +0900 Subject: [PATCH 049/284] [build] Exclude devscripts from installs Closes #4667 --- pyinst.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyinst.py b/pyinst.py index 9be5d89604..0b7c66a30f 100644 --- a/pyinst.py +++ b/pyinst.py @@ -81,7 +81,7 @@ def version_to_list(version): def dependency_options(): # Due to the current implementation, these are auto-detected, but explicitly add them just in case dependencies = [pycryptodome_module(), 'mutagen', 'brotli', 'certifi', 'websockets'] - excluded_modules = ['test', 'ytdlp_plugins', 'youtube_dl', 'youtube_dlc'] + excluded_modules = ('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts') yield from (f'--hidden-import={module}' for module in dependencies) yield '--collect-submodules=websockets' diff --git a/setup.py b/setup.py index aebe1dead9..e376a694a3 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def packages(): if setuptools_available: - return find_packages(exclude=('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins')) + return find_packages(exclude=('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts')) return [ 'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat', From 5c6d2ef9d1001508407d7825d731013f3cb99f5f Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 15 Aug 2022 13:58:39 +0530 Subject: [PATCH 050/284] [youtube] Improve format sorting for IOS formats When no itag/resolution is available for reference, use the closest resolution --- yt_dlp/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5ac481bd76..4f279b36d7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3168,7 +3168,7 @@ def append_client(*client_names): def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration): itags, stream_ids = {}, [] - itag_qualities, res_qualities = {}, {} + itag_qualities, 
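
The hunk below seeds `res_qualities` with a `{0: -1}` floor and, further down, maps a format with an unknown itag to the quality of the nearest known height. That nearest-key lookup on its own, with made-up quality ranks:

    # res_qualities maps known heights to quality ranks; the 0 entry
    # guarantees min() always has a candidate. Ranks here are invented.
    res_qualities = {0: -1, 144: 0, 360: 2, 720: 4, 1080: 6}

    def closest_quality(height):
        nearest = min(res_qualities, key=lambda x: abs(x - height))
        return res_qualities[nearest]

    print(closest_quality(728))   # 4, since 728 is closest to 720
    print(closest_quality(2160))  # 6, clamped to the largest known height
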
res_qualities = {}, {0: -1} q = qualities([ # Normally tiny is the smallest video-only formats. But # audio-only formats with unknown quality may get tagged as tiny @@ -3320,10 +3320,9 @@ def process_manifest_format(f, proto, itag): f['format_id'] = itag itags[itag] = proto - f['quality'] = next(( - q(qdict[val]) - for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) - if val in qdict), -1) + f['quality'] = itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1) + if f['quality'] == -1 and f.get('height'): + f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) return True subtitles = {} From 6d3e7424bfe8cfdbd5931a37519ca7faafff642d Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 16 Aug 2022 06:53:45 +0530 Subject: [PATCH 051/284] [jsinterp] Fix for youtube player c81bbb4a --- test/test_jsinterp.py | 5 +++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 30 +++++++++++++++--------------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c97f6dcfb9..665af4668a 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -212,6 +212,11 @@ def test_comma(self): ''') self.assertEqual(jsi.call_function('x'), 7) + jsi = JSInterpreter(''' + function x() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) } + ''') + self.assertEqual(jsi.call_function('x'), 5) + def test_void(self): jsi = JSInterpreter(''' function x() { return void 42; } diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 79bbfc3237..0ac4fd6028 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -102,6 +102,10 @@ 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', ), + ( + 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', + 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 87f141476c..47cca11761 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -33,19 +33,19 @@ '==': operator.eq, '!=': operator.ne, - '<=': operator.le, - '>=': operator.ge, - '<': operator.lt, - '>': operator.gt, + '<=': lambda a, b: (a or 0) <= (b or 0), + '>=': lambda a, b: (a or 0) >= (b or 0), + '<': lambda a, b: (a or 0) < (b or 0), + '>': lambda a, b: (a or 0) > (b or 0), '>>': operator.rshift, '<<': operator.lshift, - '+': operator.add, - '-': operator.sub, + '+': lambda a, b: (a or 0) + (b or 0), + '-': lambda a, b: (a or 0) - (b or 0), - '*': operator.mul, - '/': operator.truediv, + '*': lambda a, b: (a or 0) * (b or 0), + '/': lambda a, b: (a or 0) / b, '%': operator.mod, '**': operator.pow, @@ -339,11 +339,12 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): # Comma separated statements sub_expressions = list(self._separate(expr)) - expr = sub_expressions.pop().strip() if sub_expressions else '' - for sub_expr in sub_expressions: - ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) - if should_abort: - return ret, True + if len(sub_expressions) > 1: + for sub_expr in sub_expressions: + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + return ret, False for m in re.finditer(rf'''(?x) (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| @@ -422,8 +423,7 @@ def 
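
The operator table in the hunk above replaces bare `operator.add` and friends with lambdas that coerce missing operands, approximating how JavaScript treats `null` in arithmetic and comparisons (`null + 1` is `1`, `null <= 0` is `true`), and returns `NaN` instead of raising on division by zero. The behaviour in miniature, with `None` standing in for JS `null`:

    OPS = {
        '+': lambda a, b: (a or 0) + (b or 0),
        '<=': lambda a, b: (a or 0) <= (b or 0),
        '/': lambda a, b: (a or 0) / b if b else float('NaN'),
    }

    print(OPS['+'](None, 1))   # 1     (operator.add would raise TypeError)
    print(OPS['<='](None, 0))  # True  (matches JS: null <= 0)
    print(OPS['/'](1, 0))      # nan   (JS gives Infinity; NaN is the
                               #        interpreter's cheap approximation)
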
interpret_statement(self, stmt, local_vars, allow_recursion=100): if not separated: continue left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) - return self._operator(op, 0 if left_val is None else left_val, - right_expr, expr, local_vars, allow_recursion), should_return + return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return if m and m.group('attribute'): variable = m.group('var') From c200096c031ac6f86f2ceb3792601ab0b33439ea Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 16 Aug 2022 22:00:51 +0530 Subject: [PATCH 052/284] Fix bug in --download-archive Closes #4668 --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7a2b03cb50..7f6dc6027b 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3443,7 +3443,7 @@ def in_download_archive(self, info_dict): return False vid_ids = [self._make_archive_id(info_dict)] - vid_ids.extend(info_dict.get('_old_archive_ids', [])) + vid_ids.extend(info_dict.get('_old_archive_ids') or []) return any(id_ in self.archive for id_ in vid_ids) def record_download_archive(self, info_dict): From 3ce2933693b66e5e8948352609c8258d8d2cec15 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 16 Aug 2022 22:01:48 +0530 Subject: [PATCH 053/284] [youtube] Fix error reporting of "Incomplete data" Related: #4669 --- yt_dlp/extractor/youtube.py | 2 +- yt_dlp/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4f279b36d7..12634483e6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -809,7 +809,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers # Youtube sometimes sends incomplete data # See: https://github.com/ytdl-org/youtube-dl/issues/28194 if not traverse_obj(response, *variadic(check_get_keys)): - retry.error = ExtractorError('Incomplete data received') + retry.error = ExtractorError('Incomplete data received', expected=True) continue return response diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index db355ec92a..49ee228650 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5764,7 +5764,7 @@ def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffi if not count: return warn(e) elif isinstance(e, ExtractorError): - e = remove_end(str(e.cause) or e.orig_msg, '.') + e = remove_end(str_or_none(e.cause) or e.orig_msg, '.') warn(f'{e}. 
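
The `--download-archive` fix above turns on a subtlety of `dict.get`: the two-argument default only applies when the key is absent, not when it is present with a `None` value, so `extend()` on the result could raise. Spelled out:

    info = {'_old_archive_ids': None}  # key present, value None

    print(info.get('_old_archive_ids', []))    # None, default NOT used
    print(info.get('_old_archive_ids') or [])  # [], falls back as intended

    vid_ids = ['youtube v123']
    vid_ids.extend(info.get('_old_archive_ids') or [])  # the fixed form
    # vid_ids.extend(info.get('_old_archive_ids', [])) would be
    # extend(None) here and raise TypeError
    print(vid_ids)  # ['youtube v123']
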
Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...') delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func From f6ca640b122239d5ab215f8c2564efb7ac3e8c65 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 18 Aug 2022 16:38:35 +0530 Subject: [PATCH 054/284] [jsinterp] Fix for youtube player 1f7d5369 Closes #4635 again --- test/test_youtube_signature.py | 4 +++ yt_dlp/extractor/youtube.py | 9 +++-- yt_dlp/jsinterp.py | 66 +++++++++++++++++++++++++--------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0ac4fd6028..f1859a2fc6 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -106,6 +106,10 @@ 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', ), + ( + 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', + 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 12634483e6..795a4f42fa 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2652,9 +2652,14 @@ def _extract_n_function(self, video_id, player_url): if self.get_param('youtube_print_sig_code'): self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') - func = jsi.extract_function_from_code(*func_code) - return lambda s: func([s]) + + def inner(s): + ret = func([s]) + if ret.startswith('enhanced_except_'): + raise ExtractorError('Signature function returned an exception') + return ret + return inner def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 47cca11761..d3994e90c2 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -24,9 +24,9 @@ '||': None, '&&': None, - '&': operator.and_, - '|': operator.or_, - '^': operator.xor, + '&': lambda a, b: (a or 0) & (b or 0), + '|': lambda a, b: (a or 0) | (b or 0), + '^': lambda a, b: (a or 0) ^ (b or 0), '===': operator.is_, '!==': operator.is_not, @@ -45,8 +45,8 @@ '-': lambda a, b: (a or 0) - (b or 0), '*': lambda a, b: (a or 0) * (b or 0), - '/': lambda a, b: (a or 0) / b, - '%': operator.mod, + '/': lambda a, b: (a or 0) / b if b else float('NaN'), + '%': lambda a, b: (a or 0) % b if b else float('NaN'), '**': operator.pow, } @@ -54,7 +54,7 @@ _COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} _MATCHING_PARENS = dict(zip('({[', ')}]')) -_QUOTES = '\'"' +_QUOTES = '\'"/' def _ternary(cndn, if_true=True, if_false=False): @@ -77,6 +77,12 @@ def __init__(self): ExtractorError.__init__(self, 'Invalid continue') +class JS_Throw(ExtractorError): + def __init__(self, e): + self.error = e + ExtractorError.__init__(self, f'Uncaught exception {e}') + + class LocalNameSpace(collections.ChainMap): def __setitem__(self, key, value): for scope in self.maps: @@ -131,19 +137,24 @@ def _named_object(self, namespace, obj): @staticmethod def _separate(expr, delim=',', max_split=None): + OP_CHARS = '+-*/%&|^=<>!,;' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 - in_quote, escaping = None, False + in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 elif not in_quote and char in 
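
Adding `/` to `_QUOTES` is ambiguous on its own: in JavaScript, `/` after an operand is division, while `/` at the start of an expression or after an operator opens a regex literal. The `after_op` flag initialised above (and consulted just below) disambiguates by remembering whether the previous significant character was an operator. A toy classifier built on the same idea, ignoring escapes and character classes:

    OP_CHARS = '+-*/%&|^=<>!,;'

    def classify_slashes(expr):
        labels, after_op, in_regex = [], True, False
        for char in expr:
            if char == '/':
                if in_regex:
                    labels.append('regex end')
                    in_regex = False
                elif after_op:
                    labels.append('regex start')
                    in_regex = True
                else:
                    labels.append('division')
            after_op = char in OP_CHARS or (char == ' ' and after_op)
        return labels

    print(classify_slashes('a / b'))    # ['division']
    print(classify_slashes('a = /b/'))  # ['regex start', 'regex end']
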
counters: counters[char] -= 1 elif not escaping and char in _QUOTES and in_quote in (char, None): - in_quote = None if in_quote else char + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + elif in_quote == '/' and char in '[]': + in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' + after_op = not in_quote and char in OP_CHARS or (char == ' ' and after_op) if char != delim[pos] or any(counters.values()) or in_quote: pos = 0 @@ -210,16 +221,22 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if should_return: return ret, should_return - m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|$)', stmt) + m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt) if m: expr = stmt[len(m.group(0)):].strip() + if m.group('throw'): + raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion)) should_return = not m.group('var') if not expr: return None, should_return if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) - inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) + if expr[0] == '/': + inner = inner[1:].replace('"', R'\"') + inner = re.compile(json.loads(js_to_json(f'"{inner}"', strict=True))) + else: + inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: return inner, should_return expr = self._named_object(local_vars, inner) + outer @@ -263,21 +280,36 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'(?P<try>try|finally)\s*|(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr) + m = re.match(rf'''(?x) + (?P<try>try|finally)\s*| + (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\(|''', expr) if m and m.group('try'): if expr[m.end()] == '{': try_expr, expr = self._separate_at_paren(expr[m.end():], '}') else: try_expr, expr = expr[m.end() - 1:], '' - ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) - if should_abort: - return ret, True + try: + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + except JS_Throw as e: + local_vars['__ytdlp_exception__'] = e.error + except Exception as e: + # XXX: This works for now, but makes debugging future issues very hard + local_vars['__ytdlp_exception__'] = e ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return elif m and m.group('catch'): - # We ignore the catch block - _, expr = self._separate_at_paren(expr, '}') + catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') + if '__ytdlp_exception__' in local_vars: + catch_vars = local_vars.new_child({m.group('err'): local_vars.pop('__ytdlp_exception__')}) + ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion) + if should_abort: + return ret, True + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return @@ -390,7 +422,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): raise self.Exception(f'List index {idx} must be integer', expr) idx = int(idx) left_val[idx] = self._operator( - m.group('op'), left_val[idx], m.group('expr'), expr, local_vars, allow_recursion) + m.group('op'), self._index(left_val, idx), 
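
The try/catch rework above threads a thrown value out of the `try` body via a reserved local (`__ytdlp_exception__`) and binds it to the catch parameter in a child scope. The same handoff reduced to plain Python, with invented names:

    class JSThrow(Exception):
        def __init__(self, error):
            self.error = error  # the JavaScript value that was thrown

    def run_try_catch(try_body, catch_body, local_vars):
        try:
            try_body(local_vars)
        except JSThrow as e:
            local_vars['__exception__'] = e.error
        # Interpret the catch block only if something was thrown, with the
        # thrown value visible under the catch parameter's name.
        if '__exception__' in local_vars:
            catch_vars = dict(local_vars, err=local_vars.pop('__exception__'))
            catch_body(catch_vars)

    def body(ns): raise JSThrow('boom')
    def handler(ns): print('caught:', ns['err'])

    run_try_catch(body, handler, {})  # caught: boom
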
m.group('expr'), expr, local_vars, allow_recursion) return left_val[idx], should_return elif expr.isdigit(): From 2f1a299c50559ac2ac8c159c8df83fcc4940cfa7 Mon Sep 17 00:00:00 2001 From: ChillingPepper <90042155+ChillingPepper@users.noreply.github.com> Date: Thu, 18 Aug 2022 13:14:45 +0200 Subject: [PATCH 055/284] [extractor/SovietsCloset] Fix extractor (#4688) Closes #4200 Authored by: ChillingPepper --- yt_dlp/extractor/sovietscloset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index fc5a492a63..f1243cc492 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -44,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): _TESTS = [ { 'url': 'https://sovietscloset.com/video/1337', - 'md5': '11e58781c4ca5b283307aa54db5b3f93', + 'md5': 'bd012b04b261725510ca5383074cdd55', 'info_dict': { 'id': '1337', 'ext': 'mp4', @@ -69,11 +69,11 @@ class SovietsClosetIE(SovietsClosetBaseIE): }, { 'url': 'https://sovietscloset.com/video/1105', - 'md5': '578b1958a379e7110ba38697042e9efb', + 'md5': '89fa928f183893cb65a0b7be846d8a90', 'info_dict': { 'id': '1105', 'ext': 'mp4', - 'title': 'Arma 3 - Zeus Games #3', + 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$', 'uploader': 'SovietWomble', @@ -89,8 +89,8 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'availability': 'public', 'series': 'Arma 3', 'season': 'Zeus Games', - 'episode_number': 3, - 'episode': 'Episode 3', + 'episode_number': 5, + 'episode': 'Episode 5', }, }, ] @@ -122,7 +122,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = self._search_regex(r'(/_nuxt/static/\d+)', webpage, 'staticAssetsBase') static_assets_base = f'https://sovietscloset.com{static_assets_base}' stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream'] @@ -181,7 +181,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) - static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = self._search_regex(r'(/_nuxt/static/\d+)', webpage, 'staticAssetsBase') static_assets_base = f'https://sovietscloset.com{static_assets_base}' sovietscloset = self.parse_nuxt_jsonp(f'{static_assets_base}/payload.js', playlist_id, 'global')['games'] From 580ce007827e208edd1a72278c0b799cbb3bc251 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 18 Aug 2022 21:27:41 +0530 Subject: [PATCH 056/284] [youtube] Improve signature caching and refactor related functions --- yt_dlp/extractor/youtube.py | 128 +++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 795a4f42fa..a642f0705d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2512,20 +2512,17 @@ def _extract_signature_function(self, video_id, player_url, example_sig): assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec = self.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) + 
cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None - code = self._load_player(video_id, player_url) + if not cache_spec: + code = self._load_player(video_id, player_url) if code: res = self._parse_sig_js(code) - test_string = ''.join(map(chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - + cache_spec = [ord(c) for c in res(test_string)] self.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res + + return lambda s: ''.join(s[i] for i in cache_spec) def _print_sig_code(self, func, example_sig): if not self.get_param('youtube_print_sig_code'): @@ -2593,18 +2590,29 @@ def _parse_sig_js(self, jscode): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) + def _cached(self, func, *cache_id): + def inner(*args, **kwargs): + if cache_id not in self._player_cache: + try: + self._player_cache[cache_id] = func(*args, **kwargs) + except ExtractorError as e: + self._player_cache[cache_id] = e + except Exception as e: + self._player_cache[cache_id] = ExtractorError(traceback.format_exc(), cause=e) + + ret = self._player_cache[cache_id] + if isinstance(ret, Exception): + raise ret + return ret + return inner + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function(video_id, player_url, s) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - self._print_sig_code(func, s) - return func(s) - except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + extract_sig = self._cached( + self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s)) + func = extract_sig(video_id, player_url, s) + self._print_sig_code(func, s) + return func(s) def _decrypt_nsig(self, s, video_id, player_url): """Turn the encrypted n field into a working signature""" @@ -2612,54 +2620,47 @@ def _decrypt_nsig(self, s, video_id, player_url): raise ExtractorError('Cannot decrypt nsig without player_url') player_url = urljoin('https://www.youtube.com', player_url) - sig_id = ('nsig_value', s) - if sig_id in self._player_cache: - return self._player_cache[sig_id] - - try: - player_id = ('nsig', player_url) - if player_id not in self._player_cache: - self._player_cache[player_id] = self._extract_n_function(video_id, player_url) - func = self._player_cache[player_id] - self._player_cache[sig_id] = func(s) - self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') - return self._player_cache[sig_id] - except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) - - def _extract_n_function_name(self, jscode): - nfunc, idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', - jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) - if not idx: - return nfunc - return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, - f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] - - def _extract_n_function(self, video_id, player_url): - player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id) - - if func_code: - jsi = JSInterpreter(func_code) - else: - jscode = self._load_player(video_id, player_url) - funcname = 
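
`_cached` above memoizes failures as well as successes: an `ExtractorError` from signature extraction is stored and re-raised on every later call, so a broken player is parsed once rather than once per format. The pattern in isolation:

    def cached(func, *cache_id, _cache={}):
        # _cache as a mutable default doubles as module-level storage
        # in this sketch; the extractor keeps it on the instance.
        def inner(*args, **kwargs):
            if cache_id not in _cache:
                try:
                    _cache[cache_id] = func(*args, **kwargs)
                except Exception as e:
                    _cache[cache_id] = e  # remember the failure too
            ret = _cache[cache_id]
            if isinstance(ret, Exception):
                raise ret
            return ret
        return inner

    calls = 0
    def extract_sig(player):
        global calls
        calls += 1
        raise ValueError('cannot parse signature function')

    f = cached(extract_sig, 'sig', 'player-abc')  # invented cache key
    for _ in range(3):
        try:
            f('base.js')
        except ValueError:
            pass
    print(calls)  # 1: computed once, replayed twice
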
self._extract_n_function_name(jscode) - jsi = JSInterpreter(jscode) - func_code = jsi.extract_function_code(funcname) - self.cache.store('youtube-nsig', player_id, func_code) - + jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) if self.get_param('youtube_print_sig_code'): self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + + extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) + ret = extract_nsig(jsi, func_code)(s) + + self.write_debug(f'Decrypted nsig {s} => {ret}') + return ret + + def _extract_n_function_code(self, video_id, player_url): + player_id = self._extract_player_info(player_url) + func_code = self.cache.load('youtube-nsig', player_id) + jscode = func_code or self._load_player(video_id, player_url) + jsi = JSInterpreter(jscode) + + if func_code: + return jsi, player_id, func_code + + funcname, idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', + jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) + if idx: + funcname = json.loads(js_to_json(self._search_regex( + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] + + func_code = jsi.extract_function_code(funcname) + self.cache.store('youtube-nsig', player_id, func_code) + return jsi, player_id, func_code + + def _extract_n_function_from_code(self, jsi, func_code): func = jsi.extract_function_from_code(*func_code) - def inner(s): + def extract_nsig(s): ret = func([s]) if ret.startswith('enhanced_except_'): raise ExtractorError('Signature function returned an exception') return ret - return inner + + return extract_nsig def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -3225,7 +3226,8 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i self._decrypt_signature(encrypted_sig, video_id, player_url) ) except ExtractorError as e: - self.report_warning('Signature extraction failed: Some formats may be missing', only_once=True) + self.report_warning('Signature extraction failed: Some formats may be missing', + video_id=video_id, only_once=True) self.write_debug(e, only_once=True) continue @@ -3233,12 +3235,14 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i throttled = False if query.get('n'): try: + decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) fmt_url = update_url_query(fmt_url, { - 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + 'n': decrypt_nsig(query['n'][0], video_id, player_url) + }) except ExtractorError as e: self.report_warning( 'nsig extraction failed: You may experience throttling for some formats\n' - f'n = {query["n"][0]} ; player = {player_url}', only_once=True) + f'n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) self.write_debug(e, only_once=True) throttled = True From 587021cd9f717181b44e881941aca3f8d753758b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 18 Aug 2022 21:34:47 +0530 Subject: [PATCH 057/284] [phantomjs] Add function to execute JS without a DOM Authored by: MinePlayersPE, pukkandan --- yt_dlp/extractor/openload.py | 62 ++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index f12a0eff11..e66ed4831b 100644 --- a/yt_dlp/extractor/openload.py 
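
Splitting `_extract_n_function_code` from `_extract_n_function_from_code` separates what is cacheable, the `(argnames, body)` pair of plain data, from the callable rebuilt from it on each run. Roughly, with a Python `eval` standing in for the JS interpreter:

    import json

    def extract_function_code(jscode):
        # Pretend parse of the player JS; returns plain, JSON-safe data.
        return ['a'], 'a[::-1]'

    def build_function(argnames, body):
        # Stand-in for JSInterpreter.extract_function_from_code; the body
        # here is Python purely to keep the sketch self-contained.
        return lambda args: eval(body, {}, dict(zip(argnames, args)))

    cache = {}  # imagine self.cache.load/store, persisted as JSON

    func_code = cache.get('nsig-abc123')
    if not func_code:
        func_code = json.loads(json.dumps(extract_function_code('...js...')))
        cache['nsig-abc123'] = func_code

    print(build_function(*func_code)(['abcd']))  # dcba
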
+++ b/yt_dlp/extractor/openload.py @@ -1,3 +1,4 @@ +import collections import contextlib import json import os @@ -9,8 +10,10 @@ ExtractorError, Popen, check_executable, + format_field, get_exe_version, is_outdated_version, + shell_quote, ) @@ -49,7 +52,7 @@ class PhantomJSwrapper: This class is experimental. """ - _TEMPLATE = r''' + _BASE_JS = R''' phantom.onError = function(msg, trace) {{ var msgStack = ['PHANTOM ERROR: ' + msg]; if(trace && trace.length) {{ @@ -62,6 +65,9 @@ class PhantomJSwrapper: console.error(msgStack.join('\n')); phantom.exit(1); }}; + ''' + + _TEMPLATE = R''' var page = require('webpage').create(); var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; @@ -116,14 +122,18 @@ def __init__(self, extractor, required_version=None, timeout=10000): 'Your copy of PhantomJS is outdated, update it to version ' '%s or newer if you encounter any errors.' % required_version) - self.options = { - 'timeout': timeout, - } for name in self._TMP_FILE_NAMES: tmp = tempfile.NamedTemporaryFile(delete=False) tmp.close() self._TMP_FILES[name] = tmp + self.options = collections.ChainMap({ + 'timeout': timeout, + }, { + x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + for x in self._TMP_FILE_NAMES + }) + def __del__(self): for name in self._TMP_FILE_NAMES: with contextlib.suppress(OSError, KeyError): @@ -194,31 +204,35 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w self._save_cookies(url) - replaces = self.options - replaces['url'] = url user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] - replaces['ua'] = user_agent.replace('"', '\\"') - replaces['jscode'] = jscode + jscode = self._TEMPLATE.format_map(self.options.new_child({ + 'url': url, + 'ua': user_agent.replace('"', '\\"'), + 'jscode': jscode, + })) - for x in self._TMP_FILE_NAMES: - replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + stdout = self.execute(jscode, video_id, note2) - with open(self._TMP_FILES['script'].name, 'wb') as f: - f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) - - if video_id is None: - self.extractor.to_screen(f'{note2}') - else: - self.extractor.to_screen(f'{video_id}: {note2}') - - stdout, stderr, returncode = Popen.run( - [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name], - text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if returncode: - raise ExtractorError(f'Executing JS failed:\n{stderr}') with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') - self._load_cookies() return html, stdout + + def execute(self, jscode, video_id=None, note='Executing JS'): + """Execute JS and return stdout""" + if 'phantom.exit();' not in jscode: + jscode += ';\nphantom.exit();' + jscode = self._BASE_JS + jscode + + with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f: + f.write(jscode) + self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + + cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name] + self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') + stdout, stderr, returncode = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if returncode: + raise ExtractorError(f'Executing JS failed:\n{stderr.strip()}') + + return stdout From 25836db6bea78501c514bfbe5840f305b33afdcd Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 18 Aug 2022 21:35:18 +0530 Subject: [PATCH 058/284] 
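
The rewritten wrapper above layers its template variables with `collections.ChainMap`: the base maps hold the timeout and the temp-file paths, and `new_child` overlays per-call values such as the URL without ever mutating the base. The layering on its own, with hypothetical paths:

    import collections

    base = collections.ChainMap(
        {'timeout': 10000},
        {'html': '/tmp/page.html', 'cookies': '/tmp/cookies.txt'},
    )

    template = 'load {url} (timeout={timeout}ms) -> {html}'

    # Lookups in the overlay fall through to the base layers.
    print(template.format_map(base.new_child({'url': 'https://example.com'})))
    # load https://example.com (timeout=10000ms) -> /tmp/page.html

    print('url' in base)  # False: the base maps were never touched
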
[extractor/youtube] Add fallback to phantomjs Related #4635 --- yt_dlp/extractor/youtube.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a642f0705d..c624d8c8c0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -17,6 +17,7 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor +from .openload import PhantomJSwrapper from ..compat import functools from ..jsinterp import JSInterpreter from ..utils import ( @@ -2624,8 +2625,23 @@ def _decrypt_nsig(self, s, video_id, player_url): if self.get_param('youtube_print_sig_code'): self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') - extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) - ret = extract_nsig(jsi, func_code)(s) + try: + extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) + ret = extract_nsig(jsi, func_code)(s) + except JSInterpreter.Exception as e: + try: + jsi = PhantomJSwrapper(self) + except ExtractorError: + raise e + self.report_warning( + f'Native nsig extraction failed: Trying with PhantomJS\n' + f' n = {s} ; player = {player_url}', video_id) + self.write_debug(e) + + args, func_body = func_code + ret = jsi.execute( + f'console.log(function({", ".join(args)}) {{ {func_body} }}({s!r}));', + video_id=video_id, note='Executing signature code').strip() self.write_debug(f'Decrypted nsig {s} => {ret}') return ret @@ -2655,9 +2671,15 @@ def _extract_n_function_from_code(self, jsi, func_code): func = jsi.extract_function_from_code(*func_code) def extract_nsig(s): - ret = func([s]) + try: + ret = func([s]) + except JSInterpreter.Exception: + raise + except Exception as e: + raise JSInterpreter.Exception(traceback.format_exc(), cause=e) + if ret.startswith('enhanced_except_'): - raise ExtractorError('Signature function returned an exception') + raise JSInterpreter.Exception('Signature function returned an exception') return ret return extract_nsig @@ -3240,9 +3262,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i 'n': decrypt_nsig(query['n'][0], video_id, player_url) }) except ExtractorError as e: + phantomjs_hint = '' + if isinstance(e, JSInterpreter.Exception): + phantomjs_hint = f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} to workaround the issue\n' self.report_warning( - 'nsig extraction failed: You may experience throttling for some formats\n' - f'n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) self.write_debug(e, only_once=True) throttled = True From f60ef66371825c9f0718817d60ff79e4b2abc52a Mon Sep 17 00:00:00 2001 From: Alexander Seiler <seileralex@gmail.com> Date: Thu, 18 Aug 2022 21:57:51 +0200 Subject: [PATCH 059/284] [extractor/zattoo] Fix Zattoo resellers (#4675) Closes #4630 Authored by: goggle --- yt_dlp/extractor/_extractors.py | 26 +- yt_dlp/extractor/zattoo.py | 512 ++++++++++++++++++++++++++++---- 2 files changed, 481 insertions(+), 57 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d70302548e..1a355b2dc3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2200,17 +2200,41 @@ from 
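
The fallback above tries the native `JSInterpreter` first and reaches for PhantomJS only when interpretation raises; if PhantomJS is not installed, the original (more diagnostic) error is re-raised. The control flow with the YouTube specifics stripped out:

    class NativeError(Exception):
        pass

    def native_decrypt(challenge):
        raise NativeError('unsupported player construct')

    def phantomjs_available():
        return False  # flip to simulate an installed PhantomJS

    def decrypt(challenge):
        try:
            return native_decrypt(challenge)
        except NativeError as e:
            if not phantomjs_available():
                raise e  # keep the useful native error
            return 'result from external JS runtime'

    try:
        decrypt('nsig challenge')
    except NativeError as e:
        print('failed as expected:', e)
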
.zapiks import ZapiksIE from .zattoo import ( BBVTVIE, + BBVTVLiveIE, + BBVTVRecordingsIE, EinsUndEinsTVIE, + EinsUndEinsTVLiveIE, + EinsUndEinsTVRecordingsIE, EWETVIE, + EWETVLiveIE, + EWETVRecordingsIE, GlattvisionTVIE, + GlattvisionTVLiveIE, + GlattvisionTVRecordingsIE, MNetTVIE, - NetPlusIE, + MNetTVLiveIE, + MNetTVRecordingsIE, + NetPlusTVIE, + NetPlusTVLiveIE, + NetPlusTVRecordingsIE, OsnatelTVIE, + OsnatelTVLiveIE, + OsnatelTVRecordingsIE, QuantumTVIE, + QuantumTVLiveIE, + QuantumTVRecordingsIE, SaltTVIE, + SaltTVLiveIE, + SaltTVRecordingsIE, SAKTVIE, + SAKTVLiveIE, + SAKTVRecordingsIE, VTXTVIE, + VTXTVLiveIE, + VTXTVRecordingsIE, WalyTVIE, + WalyTVLiveIE, + WalyTVRecordingsIE, ZattooIE, ZattooLiveIE, ZattooMoviesIE, diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 2bd684c7e8..1e38812aad 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -236,32 +236,24 @@ def _extract_ondemand(self, ondemand_id): def _real_extract(self, url): video_id, record_id = self._match_valid_url(url).groups() - return self._extract_video(video_id, record_id) + return getattr(self, f'_extract_{self._TYPE}')(video_id or record_id) -def _make_valid_url(host): - return rf'https?://(?:www\.)?{re.escape(host)}/watch/[^/]+?/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' +def _create_valid_url(host, match, qs, base_re=None): + match_base = fr'|{base_re}/(?P<vid1>{match})' if base_re else '(?P<vid1>)' + return rf'''(?x)https?://(?:www\.)?{re.escape(host)}/(?: + [^?#]+\?(?:[^#]+&)?{qs}=(?P<vid2>{match}) + {match_base} + )''' class ZattooBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'zattoo' _HOST = 'zattoo.com' - @staticmethod - def _create_valid_url(match, qs, base_re=None): - match_base = fr'|{base_re}/(?P<vid1>{match})' if base_re else '(?P<vid1>)' - return rf'''(?x)https?://(?:www\.)?zattoo\.com/(?: - [^?#]+\?(?:[^#]+&)?{qs}=(?P<vid2>{match}) - {match_base} - )''' - - def _real_extract(self, url): - vid1, vid2 = self._match_valid_url(url).group('vid1', 'vid2') - return getattr(self, f'_extract_{self._TYPE}')(vid1 or vid2) - class ZattooIE(ZattooBaseIE): - _VALID_URL = ZattooBaseIE._create_valid_url(r'\d+', 'program', '(?:program|watch)/[^/]+') + _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') _TYPE = 'video' _TESTS = [{ 'url': 'https://zattoo.com/program/zdf/250170418', @@ -288,7 +280,7 @@ class ZattooIE(ZattooBaseIE): class ZattooLiveIE(ZattooBaseIE): - _VALID_URL = ZattooBaseIE._create_valid_url(r'[^/?&#]+', 'channel', 'live') + _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') _TYPE = 'live' _TESTS = [{ 'url': 'https://zattoo.com/channels/german?channel=srf_zwei', @@ -304,7 +296,7 @@ def suitable(cls, url): class ZattooMoviesIE(ZattooBaseIE): - _VALID_URL = ZattooBaseIE._create_valid_url(r'\w+', 'movie_id', 'vod/movies') + _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'\w+', 'movie_id', 'vod/movies') _TYPE = 'ondemand' _TESTS = [{ 'url': 'https://zattoo.com/vod/movies/7521', @@ -316,7 +308,7 @@ class ZattooMoviesIE(ZattooBaseIE): class ZattooRecordingsIE(ZattooBaseIE): - _VALID_URL = ZattooBaseIE._create_valid_url(r'\d+', 'recording') + _VALID_URL = _create_valid_url('zattoo.com', r'\d+', 'recording') _TYPE = 'record' _TESTS = [{ 'url': 'https://zattoo.com/recordings?recording=193615508', @@ -327,139 +319,547 @@ class ZattooRecordingsIE(ZattooBaseIE): }] -class NetPlusIE(ZattooPlatformBaseIE): - _NETRC_MACHINE = 'netplus' +class NetPlusTVBaseIE(ZattooPlatformBaseIE): + _NETRC 
= 'netplus' _HOST = 'netplus.tv' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class NetPlusTVIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'url': 'https://netplus.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class MNetTVIE(ZattooPlatformBaseIE): +class NetPlusTVLiveIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://netplus.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if NetPlusTVIE.suitable(url) else super().suitable(url) + + +class NetPlusTVRecordingsIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://netplus.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class MNetTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'mnettv' _HOST = 'tvplus.m-net.de' - _VALID_URL = _make_valid_url(_HOST) + +class MNetTVIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', + 'url': 'https://tvplus.m-net.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class WalyTVIE(ZattooPlatformBaseIE): +class MNetTVLiveIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MNetTVIE.suitable(url) else super().suitable(url) + + +class MNetTVRecordingsIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class WalyTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'walytv' _HOST = 'player.waly.tv' - _VALID_URL = _make_valid_url(_HOST) + +class WalyTVIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://player.waly.tv/watch/abc/123-abc', + 'url': 'https://player.waly.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class BBVTVIE(ZattooPlatformBaseIE): +class WalyTVLiveIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 
'https://player.waly.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if WalyTVIE.suitable(url) else super().suitable(url) + + +class WalyTVRecordingsIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://player.waly.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class BBVTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'bbvtv' _HOST = 'bbv-tv.net' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class BBVTVIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'url': 'https://bbv-tv.net/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class VTXTVIE(ZattooPlatformBaseIE): +class BBVTVLiveIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://bbv-tv.net/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if BBVTVIE.suitable(url) else super().suitable(url) + + +class BBVTVRecordingsIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://bbv-tv.net/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class VTXTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'vtxtv' _HOST = 'vtxtv.ch' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class VTXTVIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'url': 'https://vtxtv.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class GlattvisionTVIE(ZattooPlatformBaseIE): +class VTXTVLiveIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://vtxtv.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VTXTVIE.suitable(url) else super().suitable(url) + + +class VTXTVRecordingsIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://vtxtv.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class GlattvisionTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'glattvisiontv' _HOST = 'iptv.glattvision.ch' - _VALID_URL = 
_make_valid_url(_HOST) + +class GlattvisionTVIE(GlattvisionTVBaseIE): + _VALID_URL = _create_valid_url(GlattvisionTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', + 'url': 'https://iptv.glattvision.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class SAKTVIE(ZattooPlatformBaseIE): +class GlattvisionTVLiveIE(GlattvisionTVBaseIE): + _VALID_URL = _create_valid_url(GlattvisionTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if GlattvisionTVIE.suitable(url) else super().suitable(url) + + +class GlattvisionTVRecordingsIE(GlattvisionTVBaseIE): + _VALID_URL = _create_valid_url(GlattvisionTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class SAKTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'saktv' _HOST = 'saktv.ch' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class SAKTVIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'url': 'https://saktv.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class EWETVIE(ZattooPlatformBaseIE): +class SAKTVLiveIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://saktv.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SAKTVIE.suitable(url) else super().suitable(url) + + +class SAKTVRecordingsIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://saktv.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class EWETVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'ewetv' _HOST = 'tvonline.ewe.de' - _VALID_URL = _make_valid_url(_HOST) + +class EWETVIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', + 'url': 'https://tvonline.ewe.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvonline.ewe.de/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class QuantumTVIE(ZattooPlatformBaseIE): +class EWETVLiveIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 
'https://tvonline.ewe.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvonline.ewe.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if EWETVIE.suitable(url) else super().suitable(url) + + +class EWETVRecordingsIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tvonline.ewe.de/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class QuantumTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'quantumtv' _HOST = 'quantum-tv.com' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class QuantumTVIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'url': 'https://quantum-tv.com/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class OsnatelTVIE(ZattooPlatformBaseIE): +class QuantumTVLiveIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://quantum-tv.com/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if QuantumTVIE.suitable(url) else super().suitable(url) + + +class QuantumTVRecordingsIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://quantum-tv.com/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class OsnatelTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'osnateltv' _HOST = 'tvonline.osnatel.de' - _VALID_URL = _make_valid_url(_HOST) + +class OsnatelTVIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', + 'url': 'https://tvonline.osnatel.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvonline.osnatel.de/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class EinsUndEinsTVIE(ZattooPlatformBaseIE): +class OsnatelTVLiveIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvonline.osnatel.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if OsnatelTVIE.suitable(url) else super().suitable(url) + + +class OsnatelTVRecordingsIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 
'https://tvonline.osnatel.de/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class EinsUndEinsTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = '1und1tv' _HOST = '1und1.tv' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(_HOST) + +class EinsUndEinsTVIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'url': 'https://1und1.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class SaltTVIE(ZattooPlatformBaseIE): +class EinsUndEinsTVLiveIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://1und1.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if EinsUndEinsTVIE.suitable(url) else super().suitable(url) + + +class EinsUndEinsTVRecordingsIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://1und1.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class SaltTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'salttv' _HOST = 'tv.salt.ch' - _VALID_URL = _make_valid_url(_HOST) + +class SaltTVIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'url': 'https://tv.salt.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class SaltTVLiveIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tv.salt.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SaltTVIE.suitable(url) else super().suitable(url) + + +class SaltTVRecordingsIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tv.salt.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] From 2b3e43e2479511974815fba247393560183691ad Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 18 Aug 2022 15:12:04 -0500 Subject: [PATCH 060/284] [extractor/rtbf] Fix stream extractor (#4671) Closes #4656 Authored by: elyse0 --- yt_dlp/extractor/redbee.py | 43 +++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/redbee.py b/yt_dlp/extractor/redbee.py index dc8b272fc0..89a10448e1 100644 --- a/yt_dlp/extractor/redbee.py +++ b/yt_dlp/extractor/redbee.py @@ -69,6 +69,10 @@ def _get_formats_and_subtitles(self, asset_id, **kwargs): fmts, subs = 
self._extract_m3u8_formats_and_subtitles( format['mediaLocator'], asset_id, fatal=False) + if format.get('drm'): + for f in fmts: + f['has_drm'] = True + formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) @@ -269,8 +273,17 @@ def _real_extract(self, url): embed_page = self._download_webpage( 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), media_id, query={'id': media_id}) - data = self._parse_json(self._html_search_regex( - r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + media_data = self._html_search_regex(r'data-media="([^"]+)"', embed_page, 'media data', fatal=False) + if not media_data: + if re.search(r'<div[^>]+id="js-error-expired"[^>]+class="(?![^"]*hidden)', embed_page): + raise ExtractorError('Livestream has ended.', expected=True) + if re.search(r'<div[^>]+id="js-sso-connect"[^>]+class="(?![^"]*hidden)', embed_page): + self.raise_login_required() + + raise ExtractorError('Could not find media data') + + data = self._parse_json(media_data, media_id) error = data.get('error') if error: @@ -280,15 +293,20 @@ def _real_extract(self, url): if provider in self._PROVIDERS: return self.url_result(data['url'], self._PROVIDERS[provider]) - title = data['subtitle'] + title = traverse_obj(data, 'subtitle', 'title') is_live = data.get('isLive') height_re = r'-(\d+)p\.' - formats = [] + formats, subtitles = [], {} - m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + # The old api still returns m3u8 and mpd manifest for livestreams, but these are 'fake' + # since all they contain is a 20s video that is completely unrelated. + # https://github.com/yt-dlp/yt-dlp/issues/4656#issuecomment-1214461092 + m3u8_url = None if data.get('isLive') else traverse_obj(data, 'urlHlsAes128', 'urlHls') if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x http_url = data.get('url') @@ -319,10 +337,12 @@ def _real_extract(self, url): 'height': height, }) - mpd_url = data.get('urlDash') + mpd_url = None if data.get('isLive') else data.get('urlDash') if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): - formats.extend(self._extract_mpd_formats( - mpd_url, media_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + mpd_url, media_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) audio_url = data.get('urlAudio') if audio_url: @@ -332,7 +352,6 @@ def _real_extract(self, url): 'vcodec': 'none', }) - subtitles = {} for track in (data.get('tracks') or {}).values(): sub_url = track.get('url') if not sub_url: @@ -342,7 +361,7 @@ def _real_extract(self, url): }) if not formats: - fmts, subs = self._get_formats_and_subtitles(url, media_id) + fmts, subs = self._get_formats_and_subtitles(url, f'live_{media_id}' if is_live else media_id) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) From 7d3b98be4c4567b985ba7d7b17057e930457edc9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 18 Aug 2022 20:57:46 +0000 Subject: [PATCH 061/284] [extractor/instagram] Fix extraction (#4696) Closes #4657, #4532, #4475 Authored by: bashonly, pritam20ps05 --- 
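Note: this patch leans on the extractor's `_pk_to_id`/`_id_to_pk` helpers to move
between Instagram's numeric media PK and the URL shortcode. The conversion is, in
essence, base-64 over Instagram's URL-safe alphabet. A minimal standalone sketch of
the assumed scheme (the helper names here are illustrative and not part of the
patch; the alphabet and the 11-character truncation mirror what the extractor code
below relies on):

    _ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

    def pk_to_shortcode(pk):
        # Encode the integer media PK as a base-64 shortcode
        num = int(str(pk).split('_')[0])
        chars = []
        while num > 0:
            num, rem = divmod(num, 64)
            chars.append(_ENCODING_CHARS[rem])
        return ''.join(reversed(chars)) or _ENCODING_CHARS[0]

    def shortcode_to_pk(shortcode):
        # Decode (at most) the first 11 characters of a shortcode back to the PK
        pk = 0
        for ch in shortcode[:11]:
            pk = pk * 64 + _ENCODING_CHARS.index(ch)
        return pk

    # Round-trips for any shortcode that does not start with 'A' (a leading
    # zero digit, which the encoder cannot reproduce).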
yt_dlp/extractor/instagram.py | 168 ++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 94db756403..1d8e79495c 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -39,37 +39,42 @@ class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' _IS_LOGGED_IN = False + _API_BASE_URL = 'https://i.instagram.com/api/v1' + _LOGIN_URL = 'https://www.instagram.com/accounts/login' + _API_HEADERS = { + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', + } + def _perform_login(self, username, password): if self._IS_LOGGED_IN: return login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') + self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) + shared_data = self._parse_json(self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) + login = self._download_json( + f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ + **self._API_HEADERS, + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) if not login.get('authenticated'): if login.get('message'): @@ -134,7 +139,7 @@ def _extract_nodes(self, nodes, is_direct=False): } def _extract_product_media(self, product_media): - media_id = product_media.get('code') or product_media.get('id') + media_id = product_media.get('code') or _pk_to_id(product_media.get('pk')) vcodec = product_media.get('video_codec') dash_manifest_raw = product_media.get('video_dash_manifest') videos_list = product_media.get('video_versions') @@ -179,7 +184,7 @@ def _extract_product(self, product_info): user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or product_info.get('id'), + 'id': product_info.get('code') or _pk_to_id(product_info.get('pk')), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), 
expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -360,49 +365,74 @@ def _extract_embed_urls(cls, url, webpage): def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') - general_info = self._download_json( - f'https://www.instagram.com/graphql/query/?query_hash=9f8827793ef34641b2fb195d4d41151c' - f'&variables=%7B"shortcode":"{video_id}",' - '"parent_comment_count":10,"has_threaded_comments":true}', video_id, fatal=False, errnote=False, - headers={ - 'Accept': '*', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Authority': 'www.instagram.com', - 'Referer': 'https://www.instagram.com', - 'x-ig-app-id': '936619743392459', - }) - media = traverse_obj(general_info, ('data', 'shortcode_media')) or {} + media, webpage = {}, '' + + api_check = self._download_json( + f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', + video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') + + if not csrf_token: + self.report_warning('No csrf token set by Instagram API', video_id) + elif api_check.get('status') != 'ok': + self.report_warning('Instagram API is not granting access', video_id) + else: + if self._get_cookies(url).get('sessionid'): + media = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, note='Downloading video info', headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token.value, + }), ('items', 0)) + if media: + return self._extract_product(media) + + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token.value, + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media = traverse_obj(general_info, ('data', 'shortcode_media')) + if not media: - self.report_warning('General metadata extraction failed', video_id) + self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + shared_data = self._search_json( + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) - info = self._download_json( - f'https://i.instagram.com/api/v1/media/{_id_to_pk(video_id)}/info/', video_id, - fatal=False, note='Downloading video info', errnote=False, headers={ - 'Accept': '*', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Authority': 'www.instagram.com', - 'Referer': 'https://www.instagram.com', - 'x-ig-app-id': '936619743392459', - }) - if info: - media.update(info['items'][0]) - return self._extract_product(media) + if self._LOGIN_URL not in urlh.geturl(): + media.update(traverse_obj( + shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) + else: + self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage') + webpage = self._download_webpage( + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + additional_data = self._search_json( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) + if not additional_data: + self.raise_login_required('Requested content was not found, the content might be private') - webpage = self._download_webpage( - f'https://www.instagram.com/p/{video_id}/embed/', video_id, - note='Downloading embed webpage', fatal=False) - if not webpage: - self.raise_login_required('Requested content was not found, the content might be private') + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + media.update(product_item) + return self._extract_product(media) - additional_data = self._search_json( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) - product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) - if product_item: - media.update(product_item) - return self._extract_product(media) - - media.update(traverse_obj( - additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) + media.update(traverse_obj( + additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -649,12 +679,8 @@ def _real_extract(self, url): story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' videos = traverse_obj(self._download_json( - f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', - story_id, errnote=False, fatal=False, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - }), 'reels') + f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', + story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') if not videos: self.raise_login_required('You need to log in to access this content') From 4d37d4a77c50c326b273efbaed5afa1c45771474 Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Thu, 18 Aug 2022 22:58:59 +0200 Subject: [PATCH 062/284] [extractor/rai] Minor fix (#4700) Closes #4691, #4690 --- yt_dlp/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index dc911069dc..6ed8227eb6 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -156,7 +156,7 @@ def get_format_info(tbr): br = int_or_none(tbr) if len(fmts) == 1 and not br: br = fmts[0].get('tbr') - if br or 0 > 300: + if br and br > 300: tbr = compat_str(math.floor(br / 100) * 100) else: tbr = '250' From 8a3da4c68c1bf50ba69af10ea7855e2f7a2b38b4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 18 Aug 2022 22:15:49 +0000 Subject: [PATCH 063/284] [extractor/instagram] Fix bugs in 7d3b98be4c4567b985ba7d7b17057e930457edc9 (#4701) Authored by: bashonly --- yt_dlp/extractor/instagram.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 1d8e79495c..e997a3fbb7 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -378,12 +378,12 @@ def _real_extract(self, url): self.report_warning('Instagram API is not 
granting access', video_id) else: if self._get_cookies(url).get('sessionid'): - media = traverse_obj(self._download_json( + media.update(traverse_obj(self._download_json( f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, fatal=False, note='Downloading video info', headers={ **self._API_HEADERS, 'X-CSRFToken': csrf_token.value, - }), ('items', 0)) + }), ('items', 0)) or {}) if media: return self._extract_product(media) @@ -405,15 +405,15 @@ def _real_extract(self, url): 'query_hash': '9f8827793ef34641b2fb195d4d41151c', 'variables': json.dumps(variables, separators=(',', ':')), }) - media = traverse_obj(general_info, ('data', 'shortcode_media')) + media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) if not media: self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) webpage, urlh = self._download_webpage_handle(url, video_id) shared_data = self._search_json( - r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} - if self._LOGIN_URL not in urlh.geturl(): + if shared_data and self._LOGIN_URL not in urlh.geturl(): media.update(traverse_obj( shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) @@ -424,7 +424,7 @@ def _real_extract(self, url): additional_data = self._search_json( r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) if not additional_data: - self.raise_login_required('Requested content was not found, the content might be private') + self.raise_login_required('Requested content is not available, rate-limit reached or login required') product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) if product_item: From be13a6e525a05f97dffd6ee0798145132f14be3a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 19 Aug 2022 03:46:16 +0530 Subject: [PATCH 064/284] [jsinterp] Bring on-par with youtube-dl Code from: https://github.com/ytdl-org/youtube-dl/pull/31175, https://github.com/ytdl-org/youtube-dl/pull/31182 Authored by pukkandan, dirkf --- test/test_jsinterp.py | 120 ++++++++++++++++++++- yt_dlp/jsinterp.py | 236 +++++++++++++++++++++++++++++++----------- 2 files changed, 295 insertions(+), 61 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 665af4668a..863e52458b 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -7,8 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import math +import re -from yt_dlp.jsinterp import JSInterpreter +from yt_dlp.jsinterp import JS_Undefined, JSInterpreter class TestJSInterpreter(unittest.TestCase): @@ -66,6 +68,9 @@ def test_operators(self): jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') self.assertEqual(jsi.call_function('f'), 2) + jsi = JSInterpreter('function f(){return 0 ?? 
42;}') + self.assertEqual(jsi.call_function('f'), 0) + def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) @@ -229,6 +234,119 @@ def test_return_function(self): ''') self.assertEqual(jsi.call_function('x')([]), 1) + def test_null(self): + jsi = JSInterpreter(''' + function x() { return null; } + ''') + self.assertEqual(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { return [null > 0, null < 0, null == 0, null === 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [null >= 0, null <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True]) + + def test_undefined(self): + jsi = JSInterpreter(''' + function x() { return undefined === undefined; } + ''') + self.assertEqual(jsi.call_function('x'), True) + + jsi = JSInterpreter(''' + function x() { return undefined; } + ''') + self.assertEqual(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { let v; return v; } + ''') + self.assertEqual(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === 0, undefined == 0, undefined < 0, undefined > 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined >= 0, undefined <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined > null, undefined < null, undefined == null, undefined === null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, True, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === null, undefined == null, undefined < null, undefined > null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, True, False, False]) + + jsi = JSInterpreter(''' + function x() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } + ''') + for y in jsi.call_function('x'): + self.assertTrue(math.isnan(y)) + + jsi = JSInterpreter(''' + function x() { let v; return v**0; } + ''') + self.assertEqual(jsi.call_function('x'), 1) + + jsi = JSInterpreter(''' + function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) + + jsi = JSInterpreter('function x(){return undefined ?? 
42; }') + self.assertEqual(jsi.call_function('x'), 42) + + def test_object(self): + jsi = JSInterpreter(''' + function x() { return {}; } + ''') + self.assertEqual(jsi.call_function('x'), {}) + + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } + ''') + self.assertEqual(jsi.call_function('x'), [42, 0]) + + jsi = JSInterpreter(''' + function x() { let a; return a?.qq; } + ''') + self.assertEqual(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } + ''') + self.assertEqual(jsi.call_function('x'), JS_Undefined) + + def test_regex(self): + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; } + ''') + self.assertEqual(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; return a; } + ''') + self.assertIsInstance(jsi.call_function('x'), re.Pattern) + + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/i; return a; } + ''') + self.assertEqual(jsi.call_function('x').flags & re.I, re.I) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index d3994e90c2..2b68f53fae 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -16,50 +16,69 @@ write_string, ) -_NAME_RE = r'[a-zA-Z_$][\w$]*' -# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence -_OPERATORS = { # None => Defined in JSInterpreter._operator - '?': None, +def _js_bit_op(op): + def wrapped(a, b): + def zeroise(x): + return 0 if x in (None, JS_Undefined) else x + return op(zeroise(a), zeroise(b)) - '||': None, - '&&': None, - '&': lambda a, b: (a or 0) & (b or 0), - '|': lambda a, b: (a or 0) | (b or 0), - '^': lambda a, b: (a or 0) ^ (b or 0), - - '===': operator.is_, - '!==': operator.is_not, - '==': operator.eq, - '!=': operator.ne, - - '<=': lambda a, b: (a or 0) <= (b or 0), - '>=': lambda a, b: (a or 0) >= (b or 0), - '<': lambda a, b: (a or 0) < (b or 0), - '>': lambda a, b: (a or 0) > (b or 0), - - '>>': operator.rshift, - '<<': operator.lshift, - - '+': lambda a, b: (a or 0) + (b or 0), - '-': lambda a, b: (a or 0) - (b or 0), - - '*': lambda a, b: (a or 0) * (b or 0), - '/': lambda a, b: (a or 0) / b if b else float('NaN'), - '%': lambda a, b: (a or 0) % b if b else float('NaN'), - - '**': operator.pow, -} - -_COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} - -_MATCHING_PARENS = dict(zip('({[', ')}]')) -_QUOTES = '\'"/' + return wrapped -def _ternary(cndn, if_true=True, if_false=False): +def _js_arith_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return float('nan') + return op(a or 0, b or 0) + + return wrapped + + +def _js_div(a, b): + if JS_Undefined in (a, b) or not (a and b): + return float('nan') + return (a or 0) / b if b else float('inf') + + +def _js_mod(a, b): + if JS_Undefined in (a, b) or not b: + return float('nan') + return (a or 0) % b + + +def _js_exp(a, b): + if not b: + return 1 # even 0 ** 0 !! 
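+        # NB: in JavaScript, x ** 0 evaluates to 1 for every x (even undefined
+        # and NaN), so the falsy-exponent check above must short-circuit before
+        # the undefined/NaN handling below.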
+ elif JS_Undefined in (a, b): + return float('nan') + return (a or 0) ** b + + +def _js_eq_op(op): + + def wrapped(a, b): + if {a, b} <= {None, JS_Undefined}: + return op(a, a) + return op(a, b) + + return wrapped + + +def _js_comp_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return False + return op(a or 0, b or 0) + + return wrapped + + +def _js_ternary(cndn, if_true=True, if_false=False): """Simulate JS's ternary operator (cndn?if_true:if_false)""" - if cndn in (False, None, 0, ''): + if cndn in (False, None, 0, '', JS_Undefined): return if_false with contextlib.suppress(TypeError): if math.isnan(cndn): # NB: NaN cannot be checked by membership @@ -67,6 +86,50 @@ def _ternary(cndn, if_true=True, if_false=False): return if_true +# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence +_OPERATORS = { # None => Defined in JSInterpreter._operator + '?': None, + '??': None, + '||': None, + '&&': None, + + '|': _js_bit_op(operator.or_), + '^': _js_bit_op(operator.xor), + '&': _js_bit_op(operator.and_), + + '===': operator.is_, + '==': _js_eq_op(operator.eq), + '!==': operator.is_not, + '!=': _js_eq_op(operator.ne), + + '<=': _js_comp_op(operator.le), + '>=': _js_comp_op(operator.ge), + '<': _js_comp_op(operator.lt), + '>': _js_comp_op(operator.gt), + + '>>': _js_bit_op(operator.rshift), + '<<': _js_bit_op(operator.lshift), + + '+': _js_arith_op(operator.add), + '-': _js_arith_op(operator.sub), + + '*': _js_arith_op(operator.mul), + '/': _js_div, + '%': _js_mod, + '**': _js_exp, +} + +_COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} + +_NAME_RE = r'[a-zA-Z_$][\w$]*' +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) +_QUOTES = '\'"/' + + +class JS_Undefined: + pass + + class JS_Break(ExtractorError): def __init__(self): ExtractorError.__init__(self, 'Invalid break') @@ -119,6 +182,21 @@ def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs class JSInterpreter: __named_object_counter = 0 + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . 
to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + _EXC_NAME = '__yt_dlp_exception__' + def __init__(self, code, objects=None): self.code, self._functions = code, {} self._objects = {} if objects is None else objects @@ -135,6 +213,17 @@ def _named_object(self, namespace, obj): namespace[name] = obj return name + @classmethod + def _regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + @staticmethod def _separate(expr, delim=',', max_split=None): OP_CHARS = '+-*/%&|^=<>!,;' @@ -178,10 +267,13 @@ def _separate_at_paren(cls, expr, delim): def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): if op in ('||', '&&'): - if (op == '&&') ^ _ternary(left_val): + if (op == '&&') ^ _js_ternary(left_val): return left_val # short circuiting + elif op == '??': + if left_val not in (None, JS_Undefined): + return left_val elif op == '?': - right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) + right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) if not _OPERATORS.get(op): @@ -192,12 +284,14 @@ def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion) except Exception as e: raise self.Exception(f'Failed to evaluate {left_val!r} {op} {right_val!r}', expr, cause=e) - def _index(self, obj, idx): + def _index(self, obj, idx, allow_undefined=False): if idx == 'length': return len(obj) try: return obj[int(idx)] if isinstance(obj, list) else obj[idx] except Exception as e: + if allow_undefined: + return JS_Undefined raise self.Exception(f'Cannot get index {idx}', repr(obj), cause=e) def _dump(self, obj, namespace): @@ -233,8 +327,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - inner = inner[1:].replace('"', R'\"') - inner = re.compile(json.loads(js_to_json(f'"{inner}"', strict=True))) + flags, outer = self._regex_flags(outer) + inner = re.compile(inner[1:], flags=flags) else: inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: @@ -259,6 +353,17 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if expr.startswith('{'): inner, outer = self._separate_at_paren(expr, '}') + # Look for Map first + sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] + if all(len(sub_expr) == 2 for sub_expr in sub_expressions): + def dict_item(key, val): + val = self.interpret_expression(val, local_vars, allow_recursion) + if re.match(_NAME_RE, key): + return key, val + return self.interpret_expression(key, local_vars, allow_recursion), val + + return dict(dict_item(k, v) for k, v in sub_expressions), should_return + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return @@ -295,17 +400,17 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if should_abort: return ret, True except JS_Throw as e: - local_vars['__ytdlp_exception__'] = e.error + local_vars[self._EXC_NAME] = e.error except Exception as e: # XXX: This works for now, 
but makes debugging future issues very hard - local_vars['__ytdlp_exception__'] = e + local_vars[self._EXC_NAME] = e ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return elif m and m.group('catch'): catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') - if '__ytdlp_exception__' in local_vars: - catch_vars = local_vars.new_child({m.group('err'): local_vars.pop('__ytdlp_exception__')}) + if self._EXC_NAME in local_vars: + catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)}) ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion) if should_abort: return ret, True @@ -328,7 +433,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): start, cndn, increment = self._separate(constructor, ';') self.interpret_expression(start, local_vars, allow_recursion) while True: - if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): + if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) @@ -397,13 +502,13 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): (?P<assign> (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? - =(?P<expr>.*)$ + =(?!=)(?P<expr>.*)$ )|(?P<return> (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ )|(?P<indexing> (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ )|(?P<attribute> - (?P<var>{_NAME_RE})(?:\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* )|(?P<function> (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ )''', expr) @@ -414,7 +519,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): local_vars[m.group('out')] = self._operator( m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) return local_vars[m.group('out')], should_return - elif left_val is None: + elif left_val in (None, JS_Undefined): raise self.Exception(f'Cannot index undefined variable {m.group("out")}', expr) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) @@ -432,9 +537,11 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): raise JS_Break() elif expr == 'continue': raise JS_Continue() + elif expr == 'undefined': + return JS_Undefined, should_return elif m and m.group('return'): - return local_vars[m.group('name')], should_return + return local_vars.get(m.group('name'), JS_Undefined), should_return with contextlib.suppress(ValueError): return json.loads(js_to_json(expr, strict=True)), should_return @@ -447,8 +554,11 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): for op in _OPERATORS: separated = list(self._separate(expr, op)) right_expr = separated.pop() - while op in '<>*-' and len(separated) > 1 and not separated[-1].strip(): - separated.pop() + while True: + if op in '?<>*-' and len(separated) > 1 and not separated[-1].strip(): + separated.pop() + elif not (separated and op == '?' 
and right_expr.startswith('.')): + break right_expr = f'{op}{right_expr}' if op != '-': right_expr = f'{separated.pop()}{op}{right_expr}' @@ -458,8 +568,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return if m and m.group('attribute'): - variable = m.group('var') - member = m.group('member') + variable, member, nullish = m.group('var', 'member', 'nullish') if not member: member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] @@ -486,12 +595,19 @@ def eval_method(): obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) if obj is NO_DEFAULT: if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + try: + self._objects[variable] = self.extract_object(variable) + except self.Exception: + if not nullish: + raise + obj = self._objects.get(variable, JS_Undefined) + + if nullish and obj is JS_Undefined: + return JS_Undefined # Member access if arg_str is None: - return self._index(obj, member) + return self._index(obj, member, nullish) # Function call argvals = [ From a831c2ea9041557fdcd4abed0a449ef7bbca13e2 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 19 Aug 2022 04:58:54 +0530 Subject: [PATCH 065/284] [cleanup] Misc --- Changelog.md | 4 ++-- README.md | 2 +- yt_dlp/YoutubeDL.py | 1 + yt_dlp/extractor/youtube.py | 2 +- yt_dlp/extractor/zattoo.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Changelog.md b/Changelog.md index 7d16b8a8fa..304a23eafd 100644 --- a/Changelog.md +++ b/Changelog.md @@ -19,8 +19,7 @@ ### 2022.08.14 * [extractor] Fix format sorting of `channels` * [ffmpeg] Disable avconv unless `--prefer-avconv` * [ffmpeg] Smarter detection of ffprobe filename -* [patreon] Ignore erroneous media attachments by [coletdjnz](https://github.com/coletdjnz) -* [postprocessor/embedthumbnail] Detect `libatomicparsley.so` +* [embedthumbnail] Detect `libatomicparsley.so` * [ThumbnailsConvertor] Fix conversion after `fixup_webp` * [utils] Fix `get_compatible_ext` * [build] Fix changelog @@ -30,6 +29,7 @@ ### 2022.08.14 * [cleanup] Misc fixes and cleanup * [extractor/moview] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) * [extractor/parler] Add extractor by [palewire](https://github.com/palewire) +* [extractor/patreon] Ignore erroneous media attachments by [coletdjnz](https://github.com/coletdjnz) * [extractor/truth] Add extractor by [palewire](https://github.com/palewire) * [extractor/aenetworks] Add formats parameter by [jacobtruman](https://github.com/jacobtruman) * [extractor/crunchyroll] Improve `_VALID_URL`s diff --git a/README.md b/README.md index 31793b54e0..9db6939946 100644 --- a/README.md +++ b/README.md @@ -329,7 +329,7 @@ ### Platform-independent Binary (UNIX) After installing these, simply run `make`. -You can also run `make yt-dlp` instead to compile only the binary without updating any of the additional files. (The dependencies marked with **\*** are not needed for this) +You can also run `make yt-dlp` instead to compile only the binary without updating any of the additional files. 
(The build tools marked with **\*** are not needed for this) ### Standalone Py2Exe Builds (Windows) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7f6dc6027b..c2b306d70c 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -444,6 +444,7 @@ class YoutubeDL: * index: Section number (Optional) force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts noprogress: Do not print the progress bar + live_from_start: Whether to download livestreams videos from the start The following parameters are not used by YoutubeDL itself, they are used by the downloader (see yt_dlp/downloader/common.py): diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c624d8c8c0..fd62d716a2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -868,7 +868,7 @@ def _extract_video(self, renderer): else None), 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges + else 'is_live' if overlay_style == 'LIVE' or 'live now' in badges else None), 'release_timestamp': scheduled_timestamp, 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 1e38812aad..572a1d0f2a 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -320,7 +320,7 @@ class ZattooRecordingsIE(ZattooBaseIE): class NetPlusTVBaseIE(ZattooPlatformBaseIE): - _NETRC = 'netplus' + _NETRC_MACHINE = 'netplus' _HOST = 'netplus.tv' _API_HOST = 'www.%s' % _HOST From 48c88e088cca179ab8d0b39b8ca5e25fd54244f1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 19 Aug 2022 05:08:10 +0530 Subject: [PATCH 066/284] Release 2022.08.19 --- Changelog.md | 17 +++++++++++++++++ README.md | 2 +- supportedsites.md | 26 +++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Changelog.md b/Changelog.md index 304a23eafd..5d72db7d0b 100644 --- a/Changelog.md +++ b/Changelog.md @@ -11,6 +11,23 @@ # Instuctions for creating release --> +### 2022.08.19 + +* Fix bug in `--download-archive` +* [jsinterp] **Fix for new youtube players** and related improvements by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan) +* [phantomjs] Add function to execute JS without a DOM by [MinePlayersPE](https://github.com/MinePlayersPE), [pukkandan](https://github.com/pukkandan) +* [build] Exclude devscripts from installs by [Lesmiscore](https://github.com/Lesmiscore) +* [cleanup] Misc fixes and cleanup +* [extractor/youtube] **Add fallback to phantomjs** for nsig +* [extractor/youtube] Fix error reporting of "Incomplete data" +* [extractor/youtube] Improve format sorting for IOS formats +* [extractor/youtube] Improve signature caching +* [extractor/instagram] Fix extraction by [bashonly](https://github.com/bashonly), [pritam20ps05](https://github.com/pritam20ps05) +* [extractor/rai] Minor fix by [nixxo](https://github.com/nixxo) +* [extractor/rtbf] Fix stream extractor by [elyse0](https://github.com/elyse0) +* [extractor/SovietsCloset] Fix extractor by [ChillingPepper](https://github.com/ChillingPepper) +* [extractor/zattoo] Fix Zattoo resellers by [goggle](https://github.com/goggle) + ### 2022.08.14 * Merge youtube-dl: Upto [commit/d231b56](https://github.com/ytdl-org/youtube-dl/commit/d231b56) diff --git 
a/README.md b/README.md index 9db6939946..7cfeec4f12 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/d231b56](https://github.com/ytdl-org/youtube-dl/commit/d231b56717c73ee597d2e077d11b69ed48a1b02d)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/b0a60ce](https://github.com/ytdl-org/youtube-dl/commit/b0a60ce2032172aeaaf27fe3866ab72768f10cb2)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/supportedsites.md b/supportedsites.md index aa1d52b5b3..c115c00e36 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -128,6 +128,8 @@ # Supported sites - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV**: [<abbr title="netrc machine"><em>bbvtv</em></abbr>] + - **BBVTVLive**: [<abbr title="netrc machine"><em>bbvtv</em></abbr>] + - **BBVTVRecordings**: [<abbr title="netrc machine"><em>bbvtv</em></abbr>] - **Beatport** - **Beeg** - **BehindKink** @@ -348,6 +350,8 @@ # Supported sites - **ehftv** - **eHow** - **EinsUndEinsTV**: [<abbr title="netrc machine"><em>1und1tv</em></abbr>] + - **EinsUndEinsTVLive**: [<abbr title="netrc machine"><em>1und1tv</em></abbr>] + - **EinsUndEinsTVRecordings**: [<abbr title="netrc machine"><em>1und1tv</em></abbr>] - **Einthusan** - **eitb.tv** - **EllenTube** @@ -375,6 +379,8 @@ # Supported sites - **EuropeanTour** - **EUScreen** - **EWETV**: [<abbr title="netrc machine"><em>ewetv</em></abbr>] + - **EWETVLive**: [<abbr title="netrc machine"><em>ewetv</em></abbr>] + - **EWETVRecordings**: [<abbr title="netrc machine"><em>ewetv</em></abbr>] - **ExpoTV** - **Expressen** - **ExtremeTube** @@ -454,6 +460,8 @@ # Supported sites - **GiantBomb** - **Giga** - **GlattvisionTV**: [<abbr title="netrc machine"><em>glattvisiontv</em></abbr>] + - **GlattvisionTVLive**: [<abbr title="netrc machine"><em>glattvisiontv</em></abbr>] + - **GlattvisionTVRecordings**: [<abbr title="netrc machine"><em>glattvisiontv</em></abbr>] - **Glide**: Glide mobile video messages (glide.me) - **Globo**: [<abbr title="netrc machine"><em>globo</em></abbr>] - **GloboArticle** @@ -715,6 +723,8 @@ # Supported sites - **MLSSoccer** - **Mnet** - **MNetTV**: [<abbr title="netrc machine"><em>mnettv</em></abbr>] + - **MNetTVLive**: [<abbr title="netrc machine"><em>mnettv</em></abbr>] + - **MNetTVRecordings**: [<abbr title="netrc machine"><em>mnettv</em></abbr>] - **MochaVideo** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** @@ -801,7 +811,9 @@ # Supported sites - **netease:program**: 网易云音乐 - 电台节目 - **netease:singer**: 网易云音乐 - 歌手 - **netease:song**: 网易云音乐 - - **NetPlus**: [<abbr 
title="netrc machine"><em>netplus</em></abbr>] + - **NetPlusTV**: [<abbr title="netrc machine"><em>netplus</em></abbr>] + - **NetPlusTVLive**: [<abbr title="netrc machine"><em>netplus</em></abbr>] + - **NetPlusTVRecordings**: [<abbr title="netrc machine"><em>netplus</em></abbr>] - **Netverse** - **NetversePlaylist** - **Netzkino** @@ -906,6 +918,8 @@ # Supported sites - **orf:radio** - **orf:tvthek**: ORF TVthek - **OsnatelTV**: [<abbr title="netrc machine"><em>osnateltv</em></abbr>] + - **OsnatelTVLive**: [<abbr title="netrc machine"><em>osnateltv</em></abbr>] + - **OsnatelTVRecordings**: [<abbr title="netrc machine"><em>osnateltv</em></abbr>] - **OutsideTV** - **PacktPub**: [<abbr title="netrc machine"><em>packtpub</em></abbr>] - **PacktPubCourse** @@ -1013,6 +1027,8 @@ # Supported sites - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV**: [<abbr title="netrc machine"><em>quantumtv</em></abbr>] + - **QuantumTVLive**: [<abbr title="netrc machine"><em>quantumtv</em></abbr>] + - **QuantumTVRecordings**: [<abbr title="netrc machine"><em>quantumtv</em></abbr>] - **Qub** - **R7** - **R7Article** @@ -1121,7 +1137,11 @@ # Supported sites - **safari:course**: [<abbr title="netrc machine"><em>safari</em></abbr>] safaribooksonline.com online courses - **Saitosan** - **SAKTV**: [<abbr title="netrc machine"><em>saktv</em></abbr>] + - **SAKTVLive**: [<abbr title="netrc machine"><em>saktv</em></abbr>] + - **SAKTVRecordings**: [<abbr title="netrc machine"><em>saktv</em></abbr>] - **SaltTV**: [<abbr title="netrc machine"><em>salttv</em></abbr>] + - **SaltTVLive**: [<abbr title="netrc machine"><em>salttv</em></abbr>] + - **SaltTVRecordings**: [<abbr title="netrc machine"><em>salttv</em></abbr>] - **SampleFocus** - **Sapo**: SAPO Vídeos - **savefrom.net** @@ -1494,6 +1514,8 @@ # Supported sites - **VShare** - **VTM** - **VTXTV**: [<abbr title="netrc machine"><em>vtxtv</em></abbr>] + - **VTXTVLive**: [<abbr title="netrc machine"><em>vtxtv</em></abbr>] + - **VTXTVRecordings**: [<abbr title="netrc machine"><em>vtxtv</em></abbr>] - **VuClip** - **Vupload** - **VVVVID** @@ -1503,6 +1525,8 @@ # Supported sites - **Wakanim** - **Walla** - **WalyTV**: [<abbr title="netrc machine"><em>walytv</em></abbr>] + - **WalyTVLive**: [<abbr title="netrc machine"><em>walytv</em></abbr>] + - **WalyTVRecordings**: [<abbr title="netrc machine"><em>walytv</em></abbr>] - **wasdtv:clip** - **wasdtv:record** - **wasdtv:stream** From b76e9cedb33d23f21060281596f7443750f67758 Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Fri, 19 Aug 2022 00:11:11 +0000 Subject: [PATCH 067/284] [version] update Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++++---- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++++---- yt_dlp/version.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 5c54d3c5e4..6f03f6e585 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update 
instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,7 +62,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -70,8 +70,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 89d59b6f1b..7904889a5b 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -74,7 +74,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -82,8 +82,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index b2fb774fee..7d1f337322 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've 
verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -70,7 +70,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -78,8 +78,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index f30c2cb90f..da68f4517a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -55,7 +55,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -63,8 +63,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 3f955bd0b5..4fbda845fb 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the 
[README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -51,7 +51,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -59,7 +59,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 20e3050331..c51ed1b9cc 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.14** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.14 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,7 +65,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.14, Current version: 2022.08.14 - yt-dlp is up to date (2022.08.14) + Latest version: 2022.08.19, Current version: 2022.08.19 + yt-dlp is up to date (2022.08.19) <more lines> render: shell diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 9786ee978f..45f670b091 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.08.14' +__version__ = '2022.08.19' -RELEASE_GIT_HEAD = '55937202b' +RELEASE_GIT_HEAD = '48c88e088' VARIANT = None From 1704c47ba81dfa6de1b57c1c639863aad37390eb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 Aug 2022 04:52:25 +0530 Subject: [PATCH 068/284] [extractor/bitchute] Mark errors as expected Closes #4685 --- yt_dlp/extractor/bitchute.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 24d321566a..c9cbb6d1db 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -65,10 +65,12 @@ def _real_extract(self, url): error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') if error == 'Video Unavailable': raise GeoRestrictedError(error) - raise ExtractorError(error) + raise ExtractorError(error, expected=True) formats = entries[0]['formats'] self._check_formats(formats, video_id) + if not formats: + raise self.raise_no_formats('Video is unavailable', expected=True, video_id=video_id) self._sort_formats(formats) description = self._html_search_regex( From 0a6b4b82e926ffd583a5cbe81d25bbfc7f1f43ed Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 20 Aug 2022 05:00:45 +0530 Subject: [PATCH 069/284] [extractor/uktv] Improve _VALID_URL Closes #4707 Authored by: dirkf --- yt_dlp/extractor/uktvplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/uktvplay.py b/yt_dlp/extractor/uktvplay.py index abea07ab56..819ac5a35a 100644 --- a/yt_dlp/extractor/uktvplay.py +++ b/yt_dlp/extractor/uktvplay.py @@ -2,7 +2,7 @@ class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' + _VALID_URL = r'https?://uktvplay\.(?:uktv\.)?co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', 'info_dict': { From 90a1df305b628c78a497cf4010fb68cad856a314 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 21 Aug 2022 00:51:03 +0530 Subject: [PATCH 070/284] [test] Fix test_youtube_signature --- test/test_youtube_signature.py | 4 ++++ 
yt_dlp/extractor/youtube.py | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index f1859a2fc6..4b526ff2e2 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -110,6 +110,10 @@ 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', ), + ( + 'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js', + '5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index fd62d716a2..59449278de 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2646,6 +2646,17 @@ def _decrypt_nsig(self, s, video_id, player_url): self.write_debug(f'Decrypted nsig {s} => {ret}') return ret + def _extract_n_function_name(self, jscode): + funcname, idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', + jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) + if not idx: + return funcname + + return json.loads(js_to_json(self._search_regex( + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] + def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self.cache.load('youtube-nsig', player_id) @@ -2655,15 +2666,7 @@ def _extract_n_function_code(self, video_id, player_url): if func_code: return jsi, player_id, func_code - funcname, idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', - jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) - if idx: - funcname = json.loads(js_to_json(self._search_regex( - rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, - f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - - func_code = jsi.extract_function_code(funcname) + func_code = jsi.extract_function_code(self._extract_n_function_name(jscode)) self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code From b25cac650f3cbba16f46c64b0f9b0a96a9171fbc Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 21 Aug 2022 00:56:27 +0530 Subject: [PATCH 071/284] [extractor/youtube] Fix bug in format sorting --- yt_dlp/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 59449278de..5a19b591a1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3199,7 +3199,7 @@ def append_client(*client_names): def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration): itags, stream_ids = {}, [] - itag_qualities, res_qualities = {}, {0: -1} + itag_qualities, res_qualities = {}, {0: None} q = qualities([ # Normally tiny is the smallest video-only formats. 
But # audio-only formats with unknown quality may get tagged as tiny @@ -3357,7 +3357,7 @@ def process_manifest_format(f, proto, itag): f['format_id'] = itag itags[itag] = proto - f['quality'] = itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1) + f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) return True From 2d1019542af1f13a9c287969d0f2569570320872 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 21 Aug 2022 05:17:22 +0530 Subject: [PATCH 072/284] [extractor/BiliBiliSearch] Fix infinite loop Closes #4682 --- yt_dlp/extractor/bilibili.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 4315315083..9467f5f82b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -627,7 +627,9 @@ def _search_results(self, query): 'search_type': 'video', 'tids': 0, 'highlight': 1, - })['data'].get('result') or [] + })['data'].get('result') + if not videos: + break for video in videos: yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid'])) From 8d1ad6378fb52ce48a957d90bc28127ee986b6f4 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 21 Aug 2022 05:18:12 +0530 Subject: [PATCH 073/284] [extractor/BiliBiliSearch] Don't sort by date Related #4682 --- yt_dlp/extractor/bilibili.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 9467f5f82b..17c974d496 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -620,7 +620,6 @@ def _search_results(self, query): 'keyword': query, 'page': page_num, 'context': '', - 'order': 'pubdate', 'duration': 0, 'tids_2': '', '__refresh__': 'true', From 822d66e591341f8bf082be371b4beb66d72ba080 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 22 Aug 2022 04:37:23 +0530 Subject: [PATCH 074/284] Fix bug in `--alias` --- yt_dlp/options.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 9d75c39769..6373ff8c0a 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -303,10 +303,11 @@ def _create_alias(option, opt_str, value, parser): parser.add_option_group(alias_group) aliases = (x if x.startswith('-') else f'--{x}' for x in map(str.strip, aliases.split(','))) + DEST = '_triggered_aliases' + setattr(parser.values, DEST, collections.defaultdict(int)) try: alias_group.add_option( - *aliases, help=opts, nargs=nargs, type='str' if nargs else None, - dest='_triggered_aliases', default=collections.defaultdict(int), + *aliases, help=opts, nargs=nargs, dest=DEST, type='str' if nargs else None, metavar=' '.join(f'ARG{i}' for i in range(nargs)), action='callback', callback=_alias_callback, callback_kwargs={'opts': opts, 'nargs': nargs}) except Exception as err: From 992dc6b4863d0e60f2a1ce3933f67814d8a17f8d Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 22 Aug 2022 06:19:06 +0530 Subject: [PATCH 075/284] [jsinterp] Implement timeout Workaround for #4716 --- yt_dlp/extractor/openload.py | 10 +++++++--- yt_dlp/extractor/youtube.py | 2 +- yt_dlp/utils.py | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 
e66ed4831b..4bba7bdd05 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -219,7 +219,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w return html, stdout - def execute(self, jscode, video_id=None, note='Executing JS'): + def execute(self, jscode, video_id=None, *, note='Executing JS'): """Execute JS and return stdout""" if 'phantom.exit();' not in jscode: jscode += ';\nphantom.exit();' @@ -231,8 +231,12 @@ def execute(self, jscode, video_id=None, note='Executing JS'): cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name] self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') - stdout, stderr, returncode = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000, + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e) if returncode: - raise ExtractorError(f'Executing JS failed:\n{stderr.strip()}') + raise ExtractorError(f'{note} failed:\n{stderr.strip()}') return stdout diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5a19b591a1..e9f8adbd15 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2630,7 +2630,7 @@ def _decrypt_nsig(self, s, video_id, player_url): ret = extract_nsig(jsi, func_code)(s) except JSInterpreter.Exception as e: try: - jsi = PhantomJSwrapper(self) + jsi = PhantomJSwrapper(self, timeout=5000) except ExtractorError: raise e self.report_warning( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 49ee228650..13768d8469 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -860,9 +860,9 @@ def kill(self, *, timeout=0): self.wait(timeout=timeout) @classmethod - def run(cls, *args, **kwargs): + def run(cls, *args, timeout=None, **kwargs): with cls(*args, **kwargs) as proc: - stdout, stderr = proc.communicate_or_kill() + stdout, stderr = proc.communicate_or_kill(timeout=timeout) return stdout or '', stderr or '', proc.returncode From b85703d11a150967b9430f38ac938c7f41a4ad76 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 22 Aug 2022 13:45:46 -0500 Subject: [PATCH 076/284] [extractor/rtbf] Fix jwt extraction (#4738) Closes #4683 Authored by: elyse0 --- yt_dlp/extractor/redbee.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/redbee.py b/yt_dlp/extractor/redbee.py index 89a10448e1..ee510eb40f 100644 --- a/yt_dlp/extractor/redbee.py +++ b/yt_dlp/extractor/redbee.py @@ -11,6 +11,7 @@ int_or_none, strip_or_none, traverse_obj, + try_call, unified_timestamp, ) @@ -255,7 +256,7 @@ def _get_formats_and_subtitles(self, url, media_id): if not login_token: self.raise_login_required() - session_jwt = self._download_json( + session_jwt = try_call(lambda: self._get_cookies(url)['rtbf_jwt'].value) or self._download_json( 'https://login.rtbf.be/accounts.getJWT', media_id, query={ 'login_token': login_token.value, 'APIKey': self._GIGYA_API_KEY, From 07275b708b4f46c3b3fc9ea941a842fb287cad02 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <admin@xenova.com> Date: Mon, 22 Aug 2022 22:04:12 +0200 Subject: [PATCH 077/284] [extractor/medaltv] Fix extraction (#4739) Authored by: xenova --- yt_dlp/extractor/medaltv.py | 70 +++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/medaltv.py 
b/yt_dlp/extractor/medaltv.py index 5f0a9b42f6..80efcc7649 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -8,15 +8,33 @@ float_or_none, int_or_none, str_or_none, - try_get, + traverse_obj, ) class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', - 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K', + 'md5': '6930f8972914b6b9fdc2bb3918098ba0', + 'info_dict': { + 'id': 'jTBFnLKdLy15K', + 'ext': 'mp4', + 'title': "Mornu's clutch", + 'description': '', + 'uploader': 'Aciel', + 'timestamp': 1651628243, + 'upload_date': '20220504', + 'uploader_id': '19335460', + 'uploader_url': 'https://medal.tv/users/19335460', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 13, + } + }, { + 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH', + 'md5': '3d19d426fe0b2d91c26e412684e66a06', 'info_dict': { 'id': '2mA60jWAGQCBH', 'ext': 'mp4', @@ -26,9 +44,15 @@ class MedalTVIE(InfoExtractor): 'timestamp': 1603165266, 'upload_date': '20201020', 'uploader_id': '10619174', + 'thumbnail': 'https://cdn.medal.tv/10619174/thumbnail-34934644-720p.jpg?t=1080p&c=202042&missing', + 'uploader_url': 'https://medal.tv/users/10619174', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 23, } }, { - 'url': 'https://medal.tv/clips/2um24TWdty0NA', + 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { 'id': '2um24TWdty0NA', @@ -39,25 +63,42 @@ class MedalTVIE(InfoExtractor): 'timestamp': 1605580939, 'upload_date': '20201117', 'uploader_id': '5156321', + 'thumbnail': 'https://cdn.medal.tv/5156321/thumbnail-36787208-360p.jpg?t=1080p&c=202046&missing', + 'uploader_url': 'https://medal.tv/users/5156321', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 9, } }, { - 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'url': 'https://medal.tv/games/valorant/clips/37rMeFpryCC-9', 'only_matching': True, }, { - 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'url': 'https://medal.tv/games/valorant/clips/2WRj40tpY_EU9', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + path = self._match_valid_url(url).group('path') + webpage = self._download_webpage(url, video_id) - hydration_data = self._parse_json(self._search_regex( - r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', - webpage, 'hydration data', default='{}'), video_id) + next_data = self._search_json( + '<script[^>]*__NEXT_DATA__[^>]*>', webpage, + 'next data', video_id, end_pattern='</script>', fatal=False) - clip = try_get( - hydration_data, lambda x: x['clips'][video_id], dict) or {} + build_id = next_data.get('buildId') + if not build_id: + raise ExtractorError( + 'Could not find build ID.', video_id=video_id) + + locale = next_data.get('locale', 'en') + + api_response = self._download_json( + f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id) + + clip = traverse_obj(api_response, ('pageProps', 'clip')) or {} if not clip: raise ExtractorError( 'Could not find video information.', video_id=video_id) @@ -113,9 +154,8 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None): # Necessary because the id of the 
author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. - author = try_get( - hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} - author_id = str_or_none(author.get('id')) + author = traverse_obj(api_response, ('pageProps', 'profile')) or {} + author_id = str_or_none(author.get('userId')) author_url = format_field(author_id, None, 'https://medal.tv/users/%s') return { From 13db4e7b9e3932595c6b78df47ab4a0382f031f8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 23 Aug 2022 04:10:56 +0530 Subject: [PATCH 078/284] [extractor/mixcloud] All formats are audio-only Closes #4740 --- yt_dlp/extractor/mixcloud.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index a77d7e6824..becc56a2b9 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -159,6 +159,7 @@ def _real_extract(self, url): formats.append({ 'format_id': 'http', 'url': decrypted, + 'vcodec': 'none', 'downloader_options': { # Mixcloud starts throttling at >~5M 'http_chunk_size': 5242880, From 5314b521925498356e78652fe59866116d56e1d1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 07:38:55 +0530 Subject: [PATCH 079/284] [utils] Add orderedSet_from_options --- yt_dlp/YoutubeDL.py | 27 ++++++--------------------- yt_dlp/options.py | 35 +++++++++++------------------------ yt_dlp/utils.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 45 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index c2b306d70c..872e0bdc3c 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -115,6 +115,7 @@ network_exceptions, number_of_digits, orderedSet, + orderedSet_from_options, parse_filesize, preferredencoding, prepend_extension, @@ -2737,27 +2738,11 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): if self.params.get('allsubtitles', False): requested_langs = all_sub_langs elif self.params.get('subtitleslangs', False): - # A list is used so that the order of languages will be the same as - # given in subtitleslangs. 
See https://github.com/yt-dlp/yt-dlp/issues/1041 - requested_langs = [] - for lang_re in self.params.get('subtitleslangs'): - discard = lang_re[0] == '-' - if discard: - lang_re = lang_re[1:] - if lang_re == 'all': - if discard: - requested_langs = [] - else: - requested_langs.extend(all_sub_langs) - continue - current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs) - if discard: - for lang in current_langs: - while lang in requested_langs: - requested_langs.remove(lang) - else: - requested_langs.extend(current_langs) - requested_langs = orderedSet(requested_langs) + try: + requested_langs = orderedSet_from_options( + self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}') elif normal_sub_langs: requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1] else: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 6373ff8c0a..0cddb7fd52 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -29,6 +29,7 @@ format_field, get_executable_path, join_nonempty, + orderedSet_from_options, remove_end, write_string, ) @@ -232,30 +233,16 @@ def _list_from_options_callback(option, opt_str, value, parser, append=True, del current + value if append is True else value + current) def _set_from_options_callback( - option, opt_str, value, parser, delim=',', allowed_values=None, aliases={}, + option, opt_str, value, parser, allowed_values, delim=',', aliases={}, process=lambda x: x.lower().strip()): - current = set(getattr(parser.values, option.dest)) - values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1])) - while values: - actual_val = val = values.pop() - if not val: - raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}') - if val == 'all': - current.update(allowed_values) - elif val == '-all': - current = set() - elif val in aliases: - values.extend(aliases[val]) - else: - if val[0] == '-': - val = val[1:] - current.discard(val) - else: - current.update([val]) - if allowed_values is not None and val not in allowed_values: - raise optparse.OptionValueError(f'wrong {option.metavar} for {opt_str}: {actual_val}') + values = [process(value)] if delim is None else map(process, value.split(delim)) + try: + requested = orderedSet_from_options(values, collections.ChainMap(aliases, {'all': allowed_values}), + start=getattr(parser.values, option.dest)) + except ValueError as e: + raise optparse.OptionValueError(f'wrong {option.metavar} for {opt_str}: {e.args[0]}') - setattr(parser.values, option.dest, current) + setattr(parser.values, option.dest, set(requested)) def _dict_from_options_callback( option, opt_str, value, parser, @@ -447,8 +434,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', }, 'aliases': { - 'youtube-dl': ['-multistreams', 'all'], - 'youtube-dlc': ['-no-youtube-channel-redirect', '-no-live-chat', 'all'], + 'youtube-dl': ['all', '-multistreams'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 13768d8469..957c7eaa79 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5785,6 +5785,36 @@ 
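# [Editor's sketch: illustrative comment only, not part of this patch]
# What the new helper does at this call site: with allowed_values such as
# {'multistreams', 'no-live-chat', ...} and the '--compat-options' alias
# 'youtube-dl' -> ['all', '-multistreams'] (see below), a user-supplied
# value of 'youtube-dl' first expands 'all' to every allowed value and
# then discards 'multistreams'. An unrecognised value makes
# orderedSet_from_options raise ValueError, which is surfaced to the user
# as the OptionValueError above.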
def truncate_string(s, left, right=0): return f'{s[:left-3]}...{s[-right:]}' +def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None): + assert 'all' in alias_dict, '"all" alias is required' + requested = list(start or []) + for val in options: + discard = val.startswith('-') + if discard: + val = val[1:] + + if val in alias_dict: + val = alias_dict[val] if not discard else [ + i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]] + # NB: Do not allow regex in aliases for performance + requested = orderedSet_from_options(val, alias_dict, start=requested) + continue + + current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex + else [val] if val in alias_dict['all'] else None) + if current is None: + raise ValueError(val) + + if discard: + for item in current: + while item in requested: + requested.remove(item) + else: + requested.extend(current) + + return orderedSet(requested) + + # Deprecated has_certifi = bool(certifi) has_websockets = bool(websockets) From fe7866d0ed6bfa3904ce12b049a3424fdc0ea1fa Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 05:42:16 +0530 Subject: [PATCH 080/284] Add option `--use-extractors` Deprecates `--force-generic-extractor` Closes #3234, Closes #2044 Related: #4307, #1791 --- README.md | 9 ++++++++- yt_dlp/YoutubeDL.py | 41 +++++++++++++++++++++++--------------- yt_dlp/__init__.py | 1 + yt_dlp/extractor/common.py | 13 ++++++++++++ yt_dlp/options.py | 12 ++++++++++- 5 files changed, 58 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 7cfeec4f12..aab20c079f 100644 --- a/README.md +++ b/README.md @@ -375,7 +375,13 @@ ## General Options: --list-extractors List all supported extractors and exit --extractor-descriptions Output descriptions of all supported extractors and exit - --force-generic-extractor Force extraction to use the generic extractor + --use-extractors, --ies NAMES Extractor names to use separated by commas. + You can also use regexes, "all", "default" + and "end" (end URL matching); e.g. --ies + "holodex.*,end,youtube". Prefix the name + with a "-" to exclude it, e.g. --ies + default,-generic. Use --list-extractors for + a list of available extractor names --default-search PREFIX Use this prefix for unqualified URLs. E.g. "gvsearch2:python" downloads two videos from google videos for the search term "python". 
@@ -2058,6 +2064,7 @@ #### Redundant options #### Not recommended While these options still work, their use is not recommended since there are other alternatives to achieve the same + --force-generic-extractor --ies generic,default --exec-before-download CMD --exec "before_dl:CMD" --no-exec-before-download --no-exec --all-formats -f all diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 872e0bdc3c..a3d5620425 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -29,6 +29,7 @@ from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor +from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import format_text from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors @@ -237,7 +238,7 @@ class YoutubeDL: Default is 'only_download' for CLI, but False for API skip_playlist_after_errors: Number of allowed failures until the rest of the playlist is skipped - force_generic_extractor: Force downloader to use the generic extractor + allowed_extractors: List of regexes to match against extractor names that are allowed overwrites: Overwrite all video and metadata files if True, overwrite only non-video files if None and don't overwrite any file if False @@ -477,6 +478,8 @@ class YoutubeDL: The following options are deprecated and may be removed in the future: + force_generic_extractor: Force downloader to use the generic extractor + - Use allowed_extractors = ['generic', 'default'] playliststart: - Use playlist_items Playlist item to start at. playlistend: - Use playlist_items @@ -758,13 +761,6 @@ def add_info_extractor(self, ie): self._ies_instances[ie_key] = ie ie.set_downloader(self) - def _get_info_extractor_class(self, ie_key): - ie = self._ies.get(ie_key) - if ie is None: - ie = get_info_extractor(ie_key) - self.add_info_extractor(ie) - return ie - def get_info_extractor(self, ie_key): """ Get an instance of an IE with name ie_key, it will try to get one from @@ -781,8 +777,19 @@ def add_default_info_extractors(self): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractor_classes(): - self.add_info_extractor(ie) + all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()} + all_ies['end'] = UnsupportedURLIE() + try: + ie_names = orderedSet_from_options( + self.params.get('allowed_extractors', ['default']), { + 'all': list(all_ies), + 'default': [name for name, ie in all_ies.items() if ie._ENABLED], + }, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}') + for name in ie_names: + self.add_info_extractor(all_ies[name]) + self.write_debug(f'Loaded {len(ie_names)} extractors') def add_post_processor(self, pp, when='post_process'): """Add a PostProcessor object to the end of the chain.""" @@ -1413,11 +1420,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None, ie_key = 'Generic' if ie_key: - ies = {ie_key: self._get_info_extractor_class(ie_key)} + ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {} else: ies = self._ies - for ie_key, ie in ies.items(): + for key, ie in ies.items(): if not ie.suitable(url): continue @@ -1426,14 +1433,16 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None, 'and will probably not work.') temp_id = ie.get_temp_id(url) - if temp_id is not None and self.in_download_archive({'id': 
temp_id, 'ie_key': ie_key}): - self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): + self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive') if self.params.get('break_on_existing', False): raise ExistingVideoReached() break - return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) + return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default']) + self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}', + tb=False if extractors_restricted else None) def _handle_extraction_exceptions(func): @functools.wraps(func) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 317dd26231..e9234e6f49 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -766,6 +766,7 @@ def parse_options(argv=None): 'windowsfilenames': opts.windowsfilenames, 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, + 'allowed_extractors': opts.allowed_extractors or ['default'], 'ratelimit': opts.ratelimit, 'throttledratelimit': opts.throttledratelimit, 'overwrites': opts.overwrites, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index a534703e53..6337a13a44 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -480,6 +480,9 @@ class InfoExtractor: will be used by geo restriction bypass mechanism similarly to _GEO_COUNTRIES. + The _ENABLED attribute should be set to False for IEs that + are disabled by default and must be explicitly enabled. + The _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -491,6 +494,7 @@ class InfoExtractor: _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _ENABLED = True _NETRC_MACHINE = None IE_DESC = None SEARCH_KEY = None @@ -3941,3 +3945,12 @@ def _search_results(self, query): @classproperty def SEARCH_KEY(cls): return cls._SEARCH_KEY + + +class UnsupportedURLIE(InfoExtractor): + _VALID_URL = '.*' + _ENABLED = False + IE_DESC = False + + def _real_extract(self, url): + raise UnsupportedError(url) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 0cddb7fd52..bee531d1b5 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -353,10 +353,20 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', default=False, help='Output descriptions of all supported extractors and exit') + general.add_option( + '--use-extractors', '--ies', + action='callback', dest='allowed_extractors', metavar='NAMES', type='str', + default=[], callback=_list_from_options_callback, + help=( + 'Extractor names to use separated by commas. ' + 'You can also use regexes, "all", "default" and "end" (end URL matching); ' + 'e.g. --ies "holodex.*,end,youtube". ' + 'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. 
' + 'Use --list-extractors for a list of available extractor names')) general.add_option( '--force-generic-extractor', action='store_true', dest='force_generic_extractor', default=False, - help='Force extraction to use the generic extractor') + help=optparse.SUPPRESS_HELP) general.add_option( '--default-search', dest='default_search', metavar='PREFIX', From fd404bec7e6314c4584fedb1b595ee5e2d1225a6 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 08:00:13 +0530 Subject: [PATCH 081/284] Fix `--break-per-url --max-downloads` --- README.md | 4 ++-- yt_dlp/YoutubeDL.py | 1 + yt_dlp/options.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index aab20c079f..e49190ab2c 100644 --- a/README.md +++ b/README.md @@ -530,8 +530,8 @@ ## Video Selection: a file that is in the archive --break-on-reject Stop the download process when encountering a file that has been filtered out - --break-per-input Make --break-on-existing, --break-on-reject - and --max-downloads act only on the current + --break-per-input Make --break-on-existing, --break-on-reject, + --max-downloads and autonumber reset per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a3d5620425..e1bbb01fa2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3265,6 +3265,7 @@ def wrapper(*args, **kwargs): self.to_screen(f'[info] {e}') if not self.params.get('break_per_url'): raise + self._num_downloads = 0 else: if self.params.get('dump_single_json', False): self.post_extract(res) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index bee531d1b5..5e15812963 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -632,7 +632,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Make --break-on-existing, --break-on-reject and --max-downloads act only on the current input URL') + help='Make --break-on-existing, --break-on-reject, --max-downloads and autonumber reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', From 2516cafb28293612cfb6e158dac34a3117b42461 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 08:20:52 +0530 Subject: [PATCH 082/284] Fix bug in fe7866d0ed6bfa3904ce12b049a3424fdc0ea1fa --- README.md | 4 ++-- yt_dlp/extractor/generic.py | 3 +-- yt_dlp/options.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e49190ab2c..8957711ddb 100644 --- a/README.md +++ b/README.md @@ -375,13 +375,13 @@ ## General Options: --list-extractors List all supported extractors and exit --extractor-descriptions Output descriptions of all supported extractors and exit - --use-extractors, --ies NAMES Extractor names to use separated by commas. + --use-extractors NAMES Extractor names to use separated by commas. You can also use regexes, "all", "default" and "end" (end URL matching); e.g. --ies "holodex.*,end,youtube". Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. Use --list-extractors for - a list of available extractor names + a list of extractor names. (Alias: --ies) --default-search PREFIX Use this prefix for unqualified URLs. E.g. "gvsearch2:python" downloads two videos from google videos for the search term "python". 
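[Editor's note: a minimal sketch, not part of the patch series, of how the `--use-extractors`/`--ies` option shown in the README hunk above is consumed through the API. It maps to the `allowed_extractors` parameter added to `YoutubeDL` in PATCH 080; the URL and option values here are placeholders:]

import yt_dlp

# 'allowed_extractors' accepts the same names/regexes as --use-extractors:
# 'default' enables all normally-enabled extractors, and '-generic' then
# removes the generic extractor (CLI equivalent: --ies default,-generic)
with yt_dlp.YoutubeDL({'allowed_extractors': ['default', '-generic']}) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])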
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index e32ec1c8fa..b65194c604 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -3,7 +3,6 @@ import urllib.parse import xml.etree.ElementTree -from . import gen_extractor_classes from .common import InfoExtractor # isort: split from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE from .commonprotocols import RtmpIE @@ -2805,7 +2804,7 @@ def _real_extract(self, url): self._downloader.write_debug('Looking for embeds') embeds = [] - for ie in gen_extractor_classes(): + for ie in self._downloader._ies.values(): gen = ie.extract_from_webpage(self._downloader, url, webpage) current_embeds = [] try: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 5e15812963..50bba9b633 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -362,7 +362,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'You can also use regexes, "all", "default" and "end" (end URL matching); ' 'e.g. --ies "holodex.*,end,youtube". ' 'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. ' - 'Use --list-extractors for a list of available extractor names')) + 'Use --list-extractors for a list of extractor names. (Alias: --ies)')) general.add_option( '--force-generic-extractor', action='store_true', dest='force_generic_extractor', default=False, From b5e7a2e69d94d68d47586452e6014e03cf2a2805 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 13:03:33 +0530 Subject: [PATCH 083/284] Add version to infojson --- yt_dlp/YoutubeDL.py | 25 +++++++++++-------------- yt_dlp/update.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e1bbb01fa2..4330006ccb 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -48,7 +48,7 @@ get_postprocessor, ) from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping -from .update import detect_variant +from .update import REPOSITORY, current_git_head, detect_variant from .utils import ( DEFAULT_OUTTMPL, IDENTITY, @@ -3314,6 +3314,12 @@ def sanitize_info(info_dict, remove_private_keys=False): return info_dict info_dict.setdefault('epoch', int(time.time())) info_dict.setdefault('_type', 'video') + info_dict.setdefault('_version', { + 'version': __version__, + 'current_git_head': current_git_head(), + 'release_git_head': RELEASE_GIT_HEAD, + 'repository': REPOSITORY, + }) if remove_private_keys: reject = lambda k, v: v is None or k.startswith('__') or k in { @@ -3678,7 +3684,8 @@ def get_encoding(stream): if VARIANT not in (None, 'pip'): source += '*' write_debug(join_nonempty( - 'yt-dlp version', __version__, + f'{"yt-dlp" if REPOSITORY == "yt-dlp/yt-dlp" else REPOSITORY} version', + __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', delim=' ')) @@ -3694,18 +3701,8 @@ def get_encoding(stream): if self.params['compat_opts']: write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) - if source == 'source': - try: - stdout, _, _ = Popen.run( - ['git', 'rev-parse', '--short', 'HEAD'], - text=True, cwd=os.path.dirname(os.path.abspath(__file__)), - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if re.fullmatch('[0-9a-f]+', stdout.strip()): - write_debug(f'Git HEAD: {stdout.strip()}') - except Exception: - with contextlib.suppress(Exception): - sys.exc_clear() - + if current_git_head(): + write_debug(f'Git HEAD: {current_git_head()}') 
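# [Editor's illustration with hypothetical values, not part of this patch]
# With the sanitize_info() change above, every written info.json now
# carries a version block along these lines:
#   "_version": {"version": "2022.08.19", "current_git_head": null,
#                "release_git_head": "48c88e088", "repository": "yt-dlp/yt-dlp"}
# (current_git_head is null for non-source installs; see the update.py
# hunk that follows)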
write_debug(system_identifier()) exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index fc96f29850..e82cdf451a 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -1,4 +1,5 @@ import atexit +import contextlib import hashlib import json import os @@ -50,6 +51,19 @@ def detect_variant(): return VARIANT or _get_variant_and_executable_path()[0] +@functools.cache +def current_git_head(): + if detect_variant() != 'source': + return + with contextlib.suppress(Exception): + stdout, _, _ = Popen.run( + ['git', 'rev-parse', '--short', 'HEAD'], + text=True, cwd=os.path.dirname(os.path.abspath(__file__)), + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if re.fullmatch('[0-9a-f]+', stdout.strip()): + return stdout.strip() + + _FILE_SUFFIXES = { 'zip': '', 'py2exe': '_min.exe', From e5458d1d88fcc81011ab19ba610c4b37946c9fa9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 24 Aug 2022 15:10:21 +0530 Subject: [PATCH 084/284] Fix lazy extractor bug in fe7866d0ed6bfa3904ce12b049a3424fdc0ea1fa and add test Fixes https://github.com/yt-dlp/yt-dlp/pull/3234#issuecomment-1225347071 --- devscripts/lazy_load_template.py | 11 +++++--- devscripts/make_lazy_extractors.py | 4 ++- test/test_execution.py | 45 +++++++++++++++++------------- yt_dlp/extractor/testurl.py | 4 ++- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index a6e26b6f63..626b85d620 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -11,14 +11,17 @@ # These bloat the lazy_extractors, so allow them to passthrough silently ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'} +_WARNED = False class LazyLoadMetaClass(type): def __getattr__(cls, name): - if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS: - write_string( - 'WARNING: Falling back to normal extractor since lazy extractor ' - f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n') + global _WARNED + if ('_real_class' not in cls.__dict__ + and name not in ALLOWED_CLASSMETHODS and not _WARNED): + _WARNED = True + write_string('WARNING: Falling back to normal extractor since lazy extractor ' + f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n') return getattr(cls.real_class, name) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 01bd88ae61..43885331f8 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -12,7 +12,9 @@ from devscripts.utils import get_filename_args, read_file, write_file NO_ATTR = object() -STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit'] +STATIC_CLASS_PROPERTIES = [ + 'IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_ENABLED', '_NETRC_MACHINE', 'age_limit' +] CLASS_METHODS = [ 'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable' ] diff --git a/test/test_execution.py b/test/test_execution.py index 1d15fddabc..7a9e800b66 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -11,41 +11,46 @@ import contextlib import subprocess -from yt_dlp.utils import encodeArgument +from yt_dlp.utils import Popen rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - -try: - _DEV_NULL = subprocess.DEVNULL -except AttributeError: - 
_DEV_NULL = open(os.devnull, 'wb') +LAZY_EXTRACTORS = 'yt_dlp/extractor/lazy_extractors.py' class TestExecution(unittest.TestCase): - def test_import(self): - subprocess.check_call([sys.executable, '-c', 'import yt_dlp'], cwd=rootDir) - - def test_module_exec(self): - subprocess.check_call([sys.executable, '-m', 'yt_dlp', '--ignore-config', '--version'], cwd=rootDir, stdout=_DEV_NULL) + def run_yt_dlp(self, exe=(sys.executable, 'yt_dlp/__main__.py'), opts=('--version', )): + stdout, stderr, returncode = Popen.run( + [*exe, '--ignore-config', *opts], cwd=rootDir, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + print(stderr, file=sys.stderr) + self.assertEqual(returncode, 0) + return stdout.strip(), stderr.strip() def test_main_exec(self): - subprocess.check_call([sys.executable, 'yt_dlp/__main__.py', '--ignore-config', '--version'], cwd=rootDir, stdout=_DEV_NULL) + self.run_yt_dlp() + + def test_import(self): + self.run_yt_dlp(exe=(sys.executable, '-c', 'import yt_dlp')) + + def test_module_exec(self): + self.run_yt_dlp(exe=(sys.executable, '-m', 'yt_dlp')) def test_cmdline_umlauts(self): - p = subprocess.Popen( - [sys.executable, 'yt_dlp/__main__.py', '--ignore-config', encodeArgument('ä'), '--version'], - cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) - _, stderr = p.communicate() + _, stderr = self.run_yt_dlp(opts=('ä', '--version')) self.assertFalse(stderr) def test_lazy_extractors(self): try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) - subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', LAZY_EXTRACTORS], + cwd=rootDir, stdout=subprocess.DEVNULL) + self.assertTrue(os.path.exists(LAZY_EXTRACTORS)) + + _, stderr = self.run_yt_dlp(opts=('-s', 'test:')) + self.assertFalse(stderr) + + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=subprocess.DEVNULL) finally: with contextlib.suppress(OSError): - os.remove('yt_dlp/extractor/lazy_extractors.py') + os.remove(LAZY_EXTRACTORS) if __name__ == '__main__': diff --git a/yt_dlp/extractor/testurl.py b/yt_dlp/extractor/testurl.py index d205fe053f..2bce3b239a 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -8,12 +8,14 @@ class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?$' + _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$' def _real_extract(self, url): from . 
import gen_extractor_classes extractor_id, num = self._match_valid_url(url).group('extractor', 'num') + if not extractor_id: + return {'id': ':test', 'title': '', 'url': url} rex = re.compile(extractor_id, flags=re.IGNORECASE) matching_extractors = [e for e in gen_extractor_classes() if rex.search(e.IE_NAME)] From 164b03c4864b0d44cfee5e7702f7c2317164a6cf Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 25 Aug 2022 09:36:32 +0530 Subject: [PATCH 085/284] [jsinterp] Fix bug in operator precedence Fixes https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1226659543 --- test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4b526ff2e2..2f124a738c 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -114,6 +114,10 @@ 'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js', '5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw', ), + ( + 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', + '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 2b68f53fae..1995e9d0e1 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -98,8 +98,8 @@ def _js_ternary(cndn, if_true=True, if_false=False): '&': _js_bit_op(operator.and_), '===': operator.is_, - '==': _js_eq_op(operator.eq), '!==': operator.is_not, + '==': _js_eq_op(operator.eq), '!=': _js_eq_op(operator.ne), '<=': _js_comp_op(operator.le), From ca7f8b8f3150ad80e8a0de97e0b6f53df944e3d9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 26 Aug 2022 06:07:47 +0530 Subject: [PATCH 086/284] Bugfix for 822d66e591341f8bf082be371b4beb66d72ba080 Closes #4760 --- yt_dlp/options.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 50bba9b633..a0db9bc028 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -164,6 +164,7 @@ def format_option_strings(option): class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since python 3.2. 
So assume a stable interface even for private methods + ALIAS_DEST = '_triggered_aliases' ALIAS_TRIGGER_LIMIT = 100 def __init__(self): @@ -175,6 +176,7 @@ def __init__(self): formatter=_YoutubeDLHelpFormatter(), conflict_handler='resolve', ) + self.set_default(self.ALIAS_DEST, collections.defaultdict(int)) _UNKNOWN_OPTION = (optparse.BadOptionError, optparse.AmbiguousOptionError) _BAD_OPTION = optparse.OptionValueError @@ -290,11 +292,9 @@ def _create_alias(option, opt_str, value, parser): parser.add_option_group(alias_group) aliases = (x if x.startswith('-') else f'--{x}' for x in map(str.strip, aliases.split(','))) - DEST = '_triggered_aliases' - setattr(parser.values, DEST, collections.defaultdict(int)) try: alias_group.add_option( - *aliases, help=opts, nargs=nargs, dest=DEST, type='str' if nargs else None, + *aliases, help=opts, nargs=nargs, dest=parser.ALIAS_DEST, type='str' if nargs else None, metavar=' '.join(f'ARG{i}' for i in range(nargs)), action='callback', callback=_alias_callback, callback_kwargs={'opts': opts, 'nargs': nargs}) except Exception as err: From 1d64a59547d1c674de5750d4581131ec8e2d280e Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 26 Aug 2022 06:28:37 +0530 Subject: [PATCH 087/284] [extractor/vimeo:user] Fix _VALID_URL Closes #4758 --- yt_dlp/extractor/vimeo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 9e17149bed..25d2f200f2 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -1131,7 +1131,7 @@ def _real_extract(self, url): class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos)?/?(?:$|[?#])' _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -1140,6 +1140,9 @@ class VimeoUserIE(VimeoChannelIE): 'id': 'nkistudio', }, 'playlist_mincount': 66, + }, { + 'url': 'https://vimeo.com/nkistudio/', + 'only_matching': True, }] _BASE_URL_TEMPL = 'https://vimeo.com/%s' From a1af516259127d4d82bae01088b654ff980bc863 Mon Sep 17 00:00:00 2001 From: Shreyas Minocha <11537232+shreyasminocha@users.noreply.github.com> Date: Thu, 25 Aug 2022 20:29:45 -0700 Subject: [PATCH 088/284] [extractor/screencastomatic] Support `--video-password` (#4761) Authored by: shreyasminocha --- yt_dlp/extractor/screencastomatic.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/yt_dlp/extractor/screencastomatic.py b/yt_dlp/extractor/screencastomatic.py index f2f281f479..28e25e9d8b 100644 --- a/yt_dlp/extractor/screencastomatic.py +++ b/yt_dlp/extractor/screencastomatic.py @@ -1,10 +1,12 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, get_element_by_class, int_or_none, remove_start, strip_or_none, unified_strdate, + urlencode_postdata, ) @@ -34,6 +36,28 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'https://screencast-o-matic.com/player/' + video_id, video_id) + + if (self._html_extract_title(webpage) == 'Protected Content' + or 'This video is private and requires a password' in webpage): + password = self.get_param('videopassword') + + if not password: + raise ExtractorError('Password protected video, use --video-password <password>', expected=True) + + form = self._search_regex( + 
r'(?is)<form[^>]*>(?P<form>.+?)</form>', webpage, 'login form', group='form') + form_data = self._hidden_inputs(form) + form_data.update({ + 'scPassword': password, + }) + + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/password', video_id, 'Logging in', + data=urlencode_postdata(form_data)) + + if '<small class="text-danger">Invalid password</small>' in webpage: + raise ExtractorError('Unable to login: Invalid password', expected=True) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] info.update({ 'id': video_id, From 89e4d86171c7b7c997c77d4714542e0383bf0db0 Mon Sep 17 00:00:00 2001 From: cgrigis <20282170+cgrigis@users.noreply.github.com> Date: Sat, 27 Aug 2022 02:28:01 +0200 Subject: [PATCH 089/284] [extractor/arte] Bug fix (#4769) Closes #4768 Authored by: cgrigis --- yt_dlp/extractor/arte.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 980d37849f..25ecb42301 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -95,24 +95,24 @@ class ArteTVIE(ArteTVBaseIE): # all obtained by exhaustive testing _COUNTRIES_MAP = { - 'DE_FR': { + 'DE_FR': ( 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF', 'YT', - }, + ), # with both of the below 'BE' sometimes works, sometimes doesn't - 'EUR_DE_FR': { + 'EUR_DE_FR': ( 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI', 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF', 'YT', - }, - 'SAT': { + ), + 'SAT': ( 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF', 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI', 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC', 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO', 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT', - }, + ), } def _real_extract(self, url): From 4e4982ab5b259027b39a6f9013ec96aefce78aa1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 27 Aug 2022 06:20:48 +0530 Subject: [PATCH 090/284] [extractor/generic] Don't return JW player without formats CLoses #4765 --- yt_dlp/extractor/generic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index b65194c604..f53122b20c 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -25,6 +25,7 @@ parse_resolution, smuggle_url, str_or_none, + traverse_obj, try_call, unescapeHTML, unified_timestamp, @@ -2839,8 +2840,9 @@ def _real_extract(self, url): try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - self.report_detected('JW Player data') - return merge_dicts(info, info_dict) + if traverse_obj(info, 'formats', ('entries', ..., 'formats')): + self.report_detected('JW Player data') + return merge_dicts(info, info_dict) except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass From 5e01315aa1ad0c56be33cb5b6a4d079068ee7145 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 27 Aug 2022 07:22:48 +0530 Subject: [PATCH 091/284] [cache, extractor/youtube] Invalidate old cache --- yt_dlp/cache.py | 19 ++++++++++++++----- yt_dlp/extractor/youtube.py | 2 +- yt_dlp/version.py | 2 +- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index 83351b7976..602cb9edba 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -6,7 +6,8 @@ import shutil import traceback -from .utils import expand_path, 
write_json_file +from .utils import expand_path, traverse_obj, version_tuple, write_json_file +from .version import __version__ class Cache: @@ -45,12 +46,20 @@ def store(self, section, key, data, dtype='json'): if ose.errno != errno.EEXIST: raise self._ydl.write_debug(f'Saving {section}.{key} to cache') - write_json_file(data, fn) + write_json_file({'yt-dlp_version': __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() self._ydl.report_warning(f'Writing cache to {fn!r} failed: {tb}') - def load(self, section, key, dtype='json', default=None): + def _validate(self, data, after): + version = traverse_obj(data, 'yt-dlp_version') + if not version: # Backward compatibility + data, version = {'data': data}, '2022.08.19' + if not after or version_tuple(version) > version_tuple(after): + return data['data'] + self._ydl.write_debug(f'Discarding old cache from version {version} (need {after})') + + def load(self, section, key, dtype='json', default=None, *, after=None): assert dtype in ('json',) if not self.enabled: @@ -61,8 +70,8 @@ def load(self, section, key, dtype='json', default=None): try: with open(cache_fn, encoding='utf-8') as cachef: self._ydl.write_debug(f'Loading {section}.{key} from cache') - return json.load(cachef) - except ValueError: + return self._validate(json.load(cachef), after) + except (ValueError, KeyError): try: file_size = os.path.getsize(cache_fn) except OSError as oe: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e9f8adbd15..38e5faa794 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2659,7 +2659,7 @@ def _extract_n_function_name(self, jscode): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id) + func_code = self.cache.load('youtube-nsig', player_id, after='2022.08.19') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 45f670b091..1ded15df42 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,6 +1,6 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.08.19' +__version__ = '2022.08.19.1' RELEASE_GIT_HEAD = '48c88e088' From e0992d555879b07ac7622dfac1f88f9e76e32923 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Sun, 28 Aug 2022 01:37:25 +0900 Subject: [PATCH 092/284] [extractor/IslamChannel] Add extractors (#4779) Authored by: Lesmiscore --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/islamchannel.py | 82 ++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 yt_dlp/extractor/islamchannel.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1a355b2dc3..60e1b716f1 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -720,6 +720,10 @@ IqIE, IqAlbumIE ) +from .islamchannel import ( + IslamChannelIE, + IslamChannelSeriesIE, +) from .itprotv import ( ITProTVIE, ITProTVCourseIE diff --git a/yt_dlp/extractor/islamchannel.py b/yt_dlp/extractor/islamchannel.py new file mode 100644 index 0000000000..bac852b12d --- /dev/null +++ b/yt_dlp/extractor/islamchannel.py @@ -0,0 +1,82 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj, urljoin + + +class IslamChannelIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://watch.islamchannel.tv/watch/38604310', + 'info_dict': { + 'id': '38604310', + 'title': 'Omar - Young Omar', + 'description': 'md5:5cc7ddecef064ea7afe52eb5e0e33b55', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + thumbnail = self._search_regex( + r'data-poster="([^"]+)"', webpage, 'data poster', fatal=False) or \ + self._html_search_meta(('og:image', 'twitter:image'), webpage) + + headers = { + 'Token': self._search_regex(r'data-token="([^"]+)"', webpage, 'data token'), + 'Token-Expiry': self._search_regex(r'data-expiry="([^"]+)"', webpage, 'data expiry'), + 'Uvid': video_id, + } + show_stream = self._download_json( + f'https://v2-streams-elb.simplestreamcdn.com/api/show/stream/{video_id}', video_id, + query={ + 'key': self._search_regex(r'data-key="([^"]+)"', webpage, 'data key'), + 'platform': 'chrome', + }, headers=headers) + # TODO: show_stream['stream'] and show_stream['drm'] may contain something interesting + streams = self._download_json( + traverse_obj(show_stream, ('response', 'tokenization', 'url')), video_id, + headers=headers) + formats, subs = self._extract_m3u8_formats_and_subtitles(traverse_obj(streams, ('Streams', 'Adaptive')), video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_meta(('og:title', 'twitter:title'), webpage), + 'description': self._html_search_meta(('og:description', 'twitter:description', 'description'), webpage), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': [{ + 'id': 'unscaled', + 'url': thumbnail.split('?')[0], + 'ext': 'jpg', + 'preference': 2, + }, { + 'id': 'orig', + 'url': thumbnail, + 'ext': 'jpg', + 'preference': 1, + }] if thumbnail else None, + } + + +class IslamChannelSeriesIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/series/(?P<id>[a-f\d-]+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/series/a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + 'info_dict': { + 'id': 'a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + }, + 'playlist_mincount': 31, + }] + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + return self.playlist_from_matches( + re.finditer(r'<a\s+href="(/watch/\d+)"[^>]+?data-video-type="show">', webpage), + pl_id, getter=lambda x: urljoin(url, x.group(1)), ie=IslamChannelIE) From 50ac0e5416e0bdff21241852010cad4927e898d6 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sun, 28 Aug 2022 22:59:54 +0000 Subject: [PATCH 093/284] [extractor/youtube] Use device-specific user agent (#4770) Thwart latest fingerprinting attempt (see https://github.com/iv-org/invidious/issues/3230#issuecomment-1226887639) Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 44 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 38e5faa794..f55a2760ff 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -110,8 +110,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '17.29.34', - 'androidSdkVersion': 30 + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, @@ -122,8 +123,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 
'clientVersion': '17.29.34', - 'androidSdkVersion': 30 + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, @@ -135,7 +137,8 @@ 'client': { 'clientName': 'ANDROID_MUSIC', 'clientVersion': '5.16.51', - 'androidSdkVersion': 30 + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.music/5.16.51 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, @@ -146,8 +149,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '22.28.100', - 'androidSdkVersion': 30 + 'clientVersion': '22.30.100', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, @@ -162,6 +166,7 @@ 'clientName': 'IOS', 'clientVersion': '17.30.1', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.30.1 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, @@ -173,6 +178,7 @@ 'clientName': 'IOS_MESSAGES_EXTENSION', 'clientVersion': '17.30.1', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.30.1 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, @@ -555,7 +561,8 @@ def generate_api_headers( 'Origin': origin, 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), - 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client) } if session_index is None: session_index = self._extract_session_index(ytcfg) @@ -3071,7 +3078,9 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr): + _STORY_PLAYER_PARAMS = '8AEB' + + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): session_index = self._extract_session_index(player_ytcfg, master_ytcfg) syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) @@ -3081,8 +3090,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, - 'params': '8AEB' # enable stories } + if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': + yt_query['params'] = self._STORY_PLAYER_PARAMS + yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3115,7 +3126,7 @@ def _get_requested_clients(self, url, smuggled_data): return orderedSet(requested_clients) - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): initial_pr = None if webpage: initial_pr = self._search_json( @@ -3165,7 +3176,7 @@ def append_client(*client_names): try: pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player 
else None, initial_pr) + client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data) except ExtractorError as e: if last_error: self.report_warning(last_error) @@ -3428,14 +3439,17 @@ def _extract_storyboard(self, player_responses, duration): def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): + query = {'bpctr': '9999999999', 'has_verified': '1'} + if smuggled_data.get('is_story'): + query['pp'] = self._STORY_PLAYER_PARAMS webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999&has_verified=1&pp=8AEB', video_id, fatal=False) + webpage_url, video_id, fatal=False, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() player_responses, player_url = self._extract_player_responses( self._get_requested_clients(url, smuggled_data), - video_id, webpage, master_ytcfg) + video_id, webpage, master_ytcfg, smuggled_data) return webpage, master_ytcfg, player_responses, player_url @@ -6008,7 +6022,7 @@ class YoutubeStoriesIE(InfoExtractor): def _real_extract(self, url): playlist_id = f'RLTD{self._match_id(url)}' return self.url_result( - f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', + smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}), ie=YoutubeTabIE, video_id=playlist_id) From 224b5a35f7f17fec5639608d31074b8048369385 Mon Sep 17 00:00:00 2001 From: Samantaz Fox <coding@samantaz.fr> Date: Mon, 29 Aug 2022 05:36:55 +0200 Subject: [PATCH 094/284] [extractor/youtube] Update iOS Innertube clients (#4792) Authored by: SamantazFox --- yt_dlp/extractor/youtube.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index f55a2760ff..d66732c2fb 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -164,9 +164,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '17.30.1', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.30.1 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, @@ -176,9 +176,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '17.30.1', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.30.1 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, @@ -189,7 +189,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '5.18', + 'clientVersion': '5.21', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtubemusic/5.21 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -199,7 +201,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '22.29.101', + 'clientVersion': '22.33.101', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, From c4b2df872d0ab49da939bf8bda001fa4e2d2ea06 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> 
Date: Tue, 30 Aug 2022 15:57:17 +0530 Subject: [PATCH 095/284] [jsinterp] Fix `_separate` Ref: https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1231126941 --- test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 2 +- yt_dlp/jsinterp.py | 4 ++-- yt_dlp/version.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 2f124a738c..717c949540 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -118,6 +118,10 @@ 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', ), + ( + 'https://www.youtube.com/s/player/113ca41c/player_ias.vflset/en_US/base.js', + 'cgYl-tlYkhjT7A', 'hI7BBr2zUgcmMg', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d66732c2fb..b30dadf9f0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2670,7 +2670,7 @@ def _extract_n_function_name(self, jscode): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, after='2022.08.19') + func_code = self.cache.load('youtube-nsig', player_id, after='2022.08.19.1') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 1995e9d0e1..cadb013a31 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -226,7 +226,7 @@ def _regex_flags(cls, expr): @staticmethod def _separate(expr, delim=',', max_split=None): - OP_CHARS = '+-*/%&|^=<>!,;' + OP_CHARS = '+-*/%&|^=<>!,;{}()[]:' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} @@ -243,7 +243,7 @@ def _separate(expr, delim=',', max_split=None): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and char in OP_CHARS or (char == ' ' and after_op) + after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op) if char != delim[pos] or any(counters.values()) or in_quote: pos = 0 diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 1ded15df42..8bfe0a09b4 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,6 +1,6 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.08.19.1' +__version__ = '2022.08.19.2' RELEASE_GIT_HEAD = '48c88e088' From 5135ed3d4a87b3c03902aec68b60b40855b12863 Mon Sep 17 00:00:00 2001 From: OHaiiBuzzle <23693150+ohaiibuzzle@users.noreply.github.com> Date: Tue, 30 Aug 2022 17:44:16 +0700 Subject: [PATCH 096/284] [extractor/huya] Fix stream extraction (#4798) Closes #4658 Authored by: ohaiibuzzle --- yt_dlp/extractor/huya.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 9dd5e41b3e..6d6f099561 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -6,7 +6,6 @@ from ..utils import ( ExtractorError, int_or_none, - js_to_json, str_or_none, try_get, unescapeHTML, @@ -55,11 +54,7 @@ class HuyaLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None) - if not json_stream: - raise ExtractorError('Video is offline', expected=True) - stream_data = 
self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id, - transform_source=js_to_json) + stream_data = self._search_json(r'stream:\s+', webpage, 'stream', video_id=video_id, default=None) room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) @@ -67,6 +62,8 @@ def _real_extract(self, url): screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] + if not stream_info_list: + raise ExtractorError('Video is offline', expected=True) formats = [] for stream_info in stream_info_list: stream_url = stream_info.get('sFlvUrl') From d81ba7d491bf2c89246d8817438db48a5a4e4ae9 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 30 Aug 2022 17:23:59 +0530 Subject: [PATCH 097/284] [jsinterp, extractor/youtube] Minor fixes --- test/test_jsinterp.py | 5 +++++ yt_dlp/cache.py | 10 +++++----- yt_dlp/extractor/openload.py | 7 ++++--- yt_dlp/extractor/youtube.py | 5 +++-- yt_dlp/jsinterp.py | 17 +++++++++++++---- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 863e52458b..778607fb25 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -129,6 +129,11 @@ def test_precedence(self): self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) def test_builtins(self): + jsi = JSInterpreter(''' + function x() { return NaN } + ''') + self.assertTrue(math.isnan(jsi.call_function('x'))) + jsi = JSInterpreter(''' function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } ''') diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index 602cb9edba..4f9fb78d37 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -51,15 +51,15 @@ def store(self, section, key, data, dtype='json'): tb = traceback.format_exc() self._ydl.report_warning(f'Writing cache to {fn!r} failed: {tb}') - def _validate(self, data, after): + def _validate(self, data, min_ver): version = traverse_obj(data, 'yt-dlp_version') if not version: # Backward compatibility data, version = {'data': data}, '2022.08.19' - if not after or version_tuple(version) > version_tuple(after): + if not min_ver or version_tuple(version) >= version_tuple(min_ver): return data['data'] - self._ydl.write_debug(f'Discarding old cache from version {version} (need {after})') + self._ydl.write_debug(f'Discarding old cache from version {version} (needs {min_ver})') - def load(self, section, key, dtype='json', default=None, *, after=None): + def load(self, section, key, dtype='json', default=None, *, min_ver=None): assert dtype in ('json',) if not self.enabled: @@ -70,7 +70,7 @@ def load(self, section, key, dtype='json', default=None, *, after=None): try: with open(cache_fn, encoding='utf-8') as cachef: self._ydl.write_debug(f'Loading {section}.{key} from cache') - return self._validate(json.load(cachef), after) + return self._validate(json.load(cachef), min_ver) except (ValueError, KeyError): try: file_size = os.path.getsize(cache_fn) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 4bba7bdd05..d2756a0061 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -52,6 +52,8 @@ class PhantomJSwrapper: This class is experimental. 
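    Rough usage sketch (illustrative; `extractor` and `video_id` are
    placeholders for an InfoExtractor instance and a video ID, and this
    experimental interface may change):

        phantom = PhantomJSwrapper(extractor, timeout=10000)
        result = phantom.execute('1 + 1', video_id, note='Testing PhantomJS')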
""" + INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html' + _BASE_JS = R''' phantom.onError = function(msg, trace) {{ var msgStack = ['PHANTOM ERROR: ' + msg]; @@ -110,8 +112,7 @@ def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: - raise ExtractorError( - 'PhantomJS not found, Please download it from https://phantomjs.org/download.html', expected=True) + raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) self.extractor = extractor @@ -237,6 +238,6 @@ def execute(self, jscode, video_id=None, *, note='Executing JS'): except Exception as e: raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e) if returncode: - raise ExtractorError(f'{note} failed:\n{stderr.strip()}') + raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') return stdout diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b30dadf9f0..0498f980d2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2670,7 +2670,7 @@ def _extract_n_function_name(self, jscode): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, after='2022.08.19.1') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.08.19.2') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -3282,7 +3282,8 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i except ExtractorError as e: phantomjs_hint = '' if isinstance(e, JSInterpreter.Exception): - phantomjs_hint = f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} to workaround the issue\n' + phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' + f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n') self.report_warning( f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index cadb013a31..99bdca9270 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -172,7 +172,14 @@ def wrap_interpreter(cls, f): def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs): if cls.ENABLED and stmt.strip(): cls.write(stmt, level=allow_recursion) - ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs) + try: + ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs) + except Exception as e: + if cls.ENABLED: + if isinstance(e, ExtractorError): + e = e.orig_msg + cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion) + raise if cls.ENABLED and stmt.strip(): cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) return ret, should_ret @@ -226,7 +233,7 @@ def _regex_flags(cls, expr): @staticmethod def _separate(expr, delim=',', max_split=None): - OP_CHARS = '+-*/%&|^=<>!,;{}()[]:' + OP_CHARS = '+-*/%&|^=<>!,;{}:' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} @@ -504,7 +511,7 @@ def dict_item(key, val): (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? 
=(?!=)(?P<expr>.*)$ )|(?P<return> - (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ + (?!if|return|true|false|null|undefined|NaN)(?P<name>{_NAME_RE})$ )|(?P<indexing> (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ )|(?P<attribute> @@ -539,6 +546,8 @@ def dict_item(key, val): raise JS_Continue() elif expr == 'undefined': return JS_Undefined, should_return + elif expr == 'NaN': + return float('NaN'), should_return elif m and m.group('return'): return local_vars.get(m.group('name'), JS_Undefined), should_return @@ -784,7 +793,7 @@ def resf(args, kwargs={}, allow_recursion=100): global_stack[0].update(itertools.zip_longest(argnames, args, fillvalue=None)) global_stack[0].update(kwargs) var_stack = LocalNameSpace(*global_stack) - ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, allow_recursion - 1) + ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) if should_abort: return ret return resf From e1eabd7beb4cc83338a7422546ae1c9ae8b2097f Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 30 Aug 2022 18:10:48 +0530 Subject: [PATCH 098/284] [downloader/external] Smarter detection of executable Closes #4778 --- yt_dlp/downloader/external.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 9859a7b333..d117c06e0a 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -515,16 +515,14 @@ class AVconvFD(FFmpegFD): if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') } -_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()} - def list_external_downloaders(): return sorted(_BY_NAME.keys()) def get_external_downloader(external_downloader): - """ Given the name of the executable, see whether we support the given - downloader . 
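# Illustrative sketch of the new lookup below (assuming an 'aria2c' key in
# _BY_NAME whose class has EXE_NAME == 'aria2c'; the path is hypothetical):
#
#     get_external_downloader('aria2c')                  # direct basename hit in _BY_NAME
#     get_external_downloader('/usr/bin/aria2c-static')  # falls back to the EXE_NAME substring match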
""" - # Drop .exe extension on Windows + """ Given the name of the executable, see whether we support the given downloader """ bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME.get(bn, _BY_EXE.get(bn)) + return _BY_NAME.get(bn) or next(( + klass for klass in _BY_NAME.values() if klass.EXE_NAME in bn + ), None) From da4db748fa813a8de684d5ab699b8f561b982e35 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 30 Aug 2022 20:58:28 +0530 Subject: [PATCH 099/284] [utils] Add `deprecation_warning` See https://github.com/yt-dlp/yt-dlp/pull/2173#issuecomment-1097021515 --- yt_dlp/YoutubeDL.py | 20 +++++++++++++------- yt_dlp/__init__.py | 2 ++ yt_dlp/__main__.py | 1 + yt_dlp/downloader/common.py | 1 + yt_dlp/downloader/fragment.py | 4 ++-- yt_dlp/extractor/common.py | 10 ++++------ yt_dlp/extractor/youtube.py | 4 ++-- yt_dlp/options.py | 6 +++--- yt_dlp/postprocessor/common.py | 12 ++++++++---- yt_dlp/postprocessor/ffmpeg.py | 8 ++++---- yt_dlp/update.py | 8 +++----- yt_dlp/utils.py | 31 ++++++++++++++++++++++++------- 12 files changed, 67 insertions(+), 40 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4330006ccb..491e02dec6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -90,6 +90,7 @@ args_to_str, bug_reports_message, date_from_str, + deprecation_warning, determine_ext, determine_protocol, encode_compat_str, @@ -631,7 +632,7 @@ def check_deprecated(param, option, suggestion): for msg in self.params.get('_warnings', []): self.report_warning(msg) for msg in self.params.get('_deprecation_warnings', []): - self.deprecation_warning(msg) + self.deprecated_feature(msg) self.params['compat_opts'] = set(self.params.get('compat_opts', ())) if 'list-formats' in self.params['compat_opts']: @@ -835,9 +836,11 @@ def _write_string(self, message, out=None, only_once=False): def to_stdout(self, message, skip_eol=False, quiet=None): """Print message to stdout""" if quiet is not None: - self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead') + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. ' + 'Use "YoutubeDL.to_screen" instead') if skip_eol is not False: - self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. Use "YoutubeDL.to_screen" instead') + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. 
' + 'Use "YoutubeDL.to_screen" instead') self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) def to_screen(self, message, skip_eol=False, quiet=None): @@ -973,11 +976,14 @@ def report_warning(self, message, only_once=False): return self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) - def deprecation_warning(self, message): + def deprecation_warning(self, message, *, stacklevel=0): + deprecation_warning( + message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False) + + def deprecated_feature(self, message): if self.params.get('logger') is not None: - self.params['logger'].warning(f'DeprecationWarning: {message}') - else: - self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) + self.params['logger'].warning(f'Deprecated Feature: {message}') + self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True) def report_error(self, message, *args, **kwargs): ''' diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index e9234e6f49..3dc9b6e569 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -63,6 +63,8 @@ ) from .YoutubeDL import YoutubeDL +_IN_CLI = False + def _exit(status=0, *args): for msg in args: diff --git a/yt_dlp/__main__.py b/yt_dlp/__main__.py index ff5d71d3c9..895918c272 100644 --- a/yt_dlp/__main__.py +++ b/yt_dlp/__main__.py @@ -14,4 +14,5 @@ import yt_dlp if __name__ == '__main__': + yt_dlp._IN_CLI = True yt_dlp.main() diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 4962c0cf8b..9ade4269e8 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -92,6 +92,7 @@ def _set_ydl(self, ydl): for func in ( 'deprecation_warning', + 'deprecated_feature', 'report_error', 'report_file_already_downloaded', 'report_warning', diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index b1d3127c32..a5d70d0d49 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -65,8 +65,8 @@ class FragmentFD(FileDownloader): """ def report_retry_fragment(self, err, frag_index, count, retries): - self.deprecation_warning( - 'yt_dlp.downloader.FragmentFD.report_retry_fragment is deprecated. Use yt_dlp.downloader.FileDownloader.report_retry instead') + self.deprecation_warning('yt_dlp.downloader.FragmentFD.report_retry_fragment is deprecated. 
' + 'Use yt_dlp.downloader.FileDownloader.report_retry instead') return self.report_retry(err, count, retries, frag_index) def report_skip_fragment(self, frag_index, err=None): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 6337a13a44..f950d28ed3 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1766,9 +1766,8 @@ def _get_field_setting(self, field, key): if field not in self.settings: if key in ('forced', 'priority'): return False - self.ydl.deprecation_warning( - f'Using arbitrary fields ({field}) for format sorting is deprecated ' - 'and may be removed in a future version') + self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is ' + 'deprecated and may be removed in a future version') self.settings[field] = {} propObj = self.settings[field] if key not in propObj: @@ -1853,9 +1852,8 @@ def add_item(field, reverse, closest, limit_text): if self._get_field_setting(field, 'type') == 'alias': alias, field = field, self._get_field_setting(field, 'field') if self._get_field_setting(alias, 'deprecated'): - self.ydl.deprecation_warning( - f'Format sorting alias {alias} is deprecated ' - f'and may be removed in a future version. Please use {field} instead') + self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may ' + 'be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 0498f980d2..ee9cce16e7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2959,8 +2959,8 @@ def extract_thread(contents): # YouTube comments have a max depth of 2 max_depth = int_or_none(get_single_config_arg('max_comment_depth')) if max_depth: - self._downloader.deprecation_warning( - '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.') + self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. ' + 'Set max replies in the max-comments extractor argument instead') if max_depth == 1 and parent: return diff --git a/yt_dlp/options.py b/yt_dlp/options.py index a0db9bc028..e667384481 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -25,6 +25,7 @@ OUTTMPL_TYPES, POSTPROCESS_WHEN, Config, + deprecation_warning, expand_path, format_field, get_executable_path, @@ -1864,7 +1865,6 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): def _hide_login_info(opts): - write_string( - 'DeprecationWarning: "yt_dlp.options._hide_login_info" is deprecated and may be removed in a future version. ' - 'Use "yt_dlp.utils.Config.hide_login_info" instead\n') + deprecation_warning(f'"{__name__}._hide_login_info" is deprecated and may be removed ' + 'in a future version. 
Use "yt_dlp.utils.Config.hide_login_info" instead') return Config.hide_login_info(opts) diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index 20d890df03..44feda4278 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -7,10 +7,10 @@ PostProcessingError, RetryManager, _configuration_args, + deprecation_warning, encodeFilename, network_exceptions, sanitized_Request, - write_string, ) @@ -73,10 +73,14 @@ def report_warning(self, text, *args, **kwargs): if self._downloader: return self._downloader.report_warning(text, *args, **kwargs) - def deprecation_warning(self, text): + def deprecation_warning(self, msg): + warn = getattr(self._downloader, 'deprecation_warning', deprecation_warning) + return warn(msg, stacklevel=1) + + def deprecated_feature(self, msg): if self._downloader: - return self._downloader.deprecation_warning(text) - write_string(f'DeprecationWarning: {text}') + return self._downloader.deprecated_feature(msg) + return deprecation_warning(msg, stacklevel=1) def report_error(self, text, *args, **kwargs): self.deprecation_warning('"yt_dlp.postprocessor.PostProcessor.report_error" is deprecated. ' diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index a1f367ae42..76f9d29c5e 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -15,6 +15,7 @@ Popen, PostProcessingError, _get_exe_version_output, + deprecation_warning, detect_exe_version, determine_ext, dfxp2srt, @@ -30,7 +31,6 @@ traverse_obj, variadic, write_json_file, - write_string, ) EXT_TO_OUT_FORMATS = { @@ -187,8 +187,8 @@ def _get_version(self, kind): else: self.probe_basename = basename if basename == self._ffmpeg_to_avconv[kind]: - self.deprecation_warning( - f'Support for {self._ffmpeg_to_avconv[kind]} is deprecated and may be removed in a future version. Use {kind} instead') + self.deprecated_feature(f'Support for {self._ffmpeg_to_avconv[kind]} is deprecated and ' + f'may be removed in a future version. Use {kind} instead') return version @functools.cached_property @@ -1064,7 +1064,7 @@ def __init__(self, downloader=None, format=None): @classmethod def is_webp(cls, path): - write_string(f'DeprecationWarning: {cls.__module__}.{cls.__name__}.is_webp is deprecated') + deprecation_warning(f'{cls.__module__}.{cls.__name__}.is_webp is deprecated') return imghdr.what(path) == 'webp' def fixup_webp(self, info, idx=-1): diff --git a/yt_dlp/update.py b/yt_dlp/update.py index e82cdf451a..026bc12aa4 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -14,6 +14,7 @@ from .utils import ( Popen, cached_method, + deprecation_warning, shell_quote, system_identifier, traverse_obj, @@ -302,11 +303,8 @@ def run_update(ydl): def update_self(to_screen, verbose, opener): import traceback - from .utils import write_string - - write_string( - 'DeprecationWarning: "yt_dlp.update.update_self" is deprecated and may be removed in a future version. ' - 'Use "yt_dlp.update.run_update(ydl)" instead\n') + deprecation_warning(f'"{__name__}.update_self" is deprecated and may be removed ' + f'in a future version. Use "{__name__}.run_update(ydl)" instead') printfn = to_screen diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 957c7eaa79..da2d042cb3 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -828,8 +828,8 @@ def escapeHTML(text): def process_communicate_or_kill(p, *args, **kwargs): - write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated ' - 'and may be removed in a future version. 
Use yt_dlp.utils.Popen.communicate_or_kill instead') + deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed ' + f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead') return Popen.communicate_or_kill(p, *args, **kwargs) @@ -1934,7 +1934,7 @@ def __eq__(self, other): def platform_name(): """ Returns the platform name as a str """ - write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead') + deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead') return platform.platform() @@ -1980,6 +1980,23 @@ def write_string(s, out=None, encoding=None): out.flush() +def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): + from . import _IN_CLI + if _IN_CLI: + if msg in deprecation_warning._cache: + return + deprecation_warning._cache.add(msg) + if printer: + return printer(f'{msg}{bug_reports_message()}', **kwargs) + return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs) + else: + import warnings + warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3) + + +deprecation_warning._cache = set() + + def bytes_to_intlist(bs): if not bs: return [] @@ -4862,8 +4879,8 @@ def decode_base_n(string, n=None, table=None): def decode_base(value, digits): - write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated ' - 'and may be removed in a future version. Use yt_dlp.decode_base_n instead') + deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed ' + f'in a future version. Use {__name__}.decode_base_n instead') return decode_base_n(value, table=digits) @@ -5332,8 +5349,8 @@ def _traverse_obj(obj, path, _current_depth=0): def traverse_dict(dictn, keys, casesense=True): - write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated ' - 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead') + deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed ' + f'in a future version. 
Use "{__name__}.traverse_obj" instead') return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) From 82ea226c61880c9118cce32681e54be24839519a Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Wed, 31 Aug 2022 01:24:14 +0900 Subject: [PATCH 100/284] Restore LD_LIBRARY_PATH when using PyInstaller (#4666) Authored by: Lesmiscore --- yt_dlp/utils.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index da2d042cb3..00f2fbf423 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -840,12 +840,35 @@ class Popen(subprocess.Popen): else: _startupinfo = None - def __init__(self, *args, text=False, **kwargs): + @staticmethod + def _fix_pyinstaller_ld_path(env): + """Restore LD_LIBRARY_PATH when using PyInstaller + Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations + https://github.com/yt-dlp/yt-dlp/issues/4573 + """ + if not hasattr(sys, '_MEIPASS'): + return + + def _fix(key): + orig = env.get(f'{key}_ORIG') + if orig is None: + env.pop(key, None) + else: + env[key] = orig + + _fix('LD_LIBRARY_PATH') # Linux + _fix('DYLD_LIBRARY_PATH') # macOS + + def __init__(self, *args, env=None, text=False, **kwargs): + if env is None: + env = os.environ.copy() + self._fix_pyinstaller_ld_path(env) + if text is True: kwargs['universal_newlines'] = True # For 3.6 compatibility kwargs.setdefault('encoding', 'utf-8') kwargs.setdefault('errors', 'replace') - super().__init__(*args, **kwargs, startupinfo=self._startupinfo) + super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) def communicate_or_kill(self, *args, **kwargs): try: From 459262ac97c039a426f51f3fb3a5d780de5b9dca Mon Sep 17 00:00:00 2001 From: Jeff Huffman <tejing@tejing.com> Date: Tue, 30 Aug 2022 12:34:13 -0400 Subject: [PATCH 101/284] [extractor/crunchyroll:beta] Use anonymous access (#4704) Closes #4692 Authored by: tejing1 --- yt_dlp/extractor/crunchyroll.py | 36 +++++++++------------------------ 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index d4968c13b2..141d8c5a7c 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -720,15 +720,20 @@ class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): def _get_params(self, lang): if not CrunchyrollBetaBaseIE.params: + if self._get_cookies(f'https://beta.crunchyroll.com/{lang}').get('etp_rt'): + grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' + else: + grant_type, key = 'client_id', 'anonClientId' + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') + auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) + 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') + }, data=f'grant_type={grant_type}'.encode('ascii')) policy_response = self._download_json( 
f'{api_domain}/index/v2', None, note='Retrieving signed policy', headers={ @@ -747,21 +752,6 @@ def _get_params(self, lang): CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) return CrunchyrollBetaBaseIE.params - def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) - content_data = initial_state['content']['byId'][internal_id] - if is_episode: - video_id = content_data['external_id'].split('.')[1] - series_id = content_data['episode_metadata']['series_slug_title'] - else: - series_id = content_data['slug_title'] - series_id = re.sub(r'-{2,}', '-', series_id) - url = f'https://www.crunchyroll.com/{lang}{series_id}' - if is_episode: - url = url + f'/{display_id}-{video_id}' - self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') - return self.url_result(url, iekey, display_id) - class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' @@ -800,10 +790,6 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) episode_response = self._download_json( @@ -897,10 +883,6 @@ class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) series_response = self._download_json( From 9bd13fe5bbe1df6bb01d4edb68f2c63a4812bf94 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 Aug 2022 16:54:46 +0000 Subject: [PATCH 102/284] [cookies] Support firefox container in `--cookies-from-browser` (#4753) Authored by: bashonly --- README.md | 11 ++++++----- yt_dlp/YoutubeDL.py | 5 +++-- yt_dlp/__init__.py | 6 +++++- yt_dlp/cookies.py | 45 ++++++++++++++++++++++++++++++++++++--------- yt_dlp/options.py | 8 ++++---- 5 files changed, 54 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 8957711ddb..c101048d5a 100644 --- a/README.md +++ b/README.md @@ -706,13 +706,14 @@ ## Filesystem Options: and dump cookie jar in --no-cookies Do not read/dump cookies from/to file (default) - --cookies-from-browser BROWSER[+KEYRING][:PROFILE] + --cookies-from-browser BROWSER[+KEYRING][:PROFILE[:CONTAINER]] The name of the browser and (optionally) the name/path of the profile to load cookies - from, separated by a ":". Currently - supported browsers are: brave, chrome, - chromium, edge, firefox, opera, safari, - vivaldi. By default, the most recently + from (and container name if Firefox) + separated by a ":". Currently supported + browsers are: brave, chrome, chromium, edge, + firefox, opera, safari, vivaldi. By default, + the default container of the most recently accessed profile is used. 
The keyring used for decrypting Chromium cookies on Linux can be (optionally) specified after the browser diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 491e02dec6..10c17ea007 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -304,8 +304,9 @@ class YoutubeDL: should act on each input URL as opposed to for the entire queue cookiefile: File name or text stream from where cookies should be read and dumped to cookiesfrombrowser: A tuple containing the name of the browser, the profile - name/path from where cookies are loaded, and the name of the - keyring, e.g. ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + name/path from where cookies are loaded, the name of the keyring, + and the container name, e.g. ('chrome', ) or + ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta') legacyserverconnect: Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation nocheckcertificate: Do not verify SSL certificates diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3dc9b6e569..f4a2086ce2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -346,6 +346,7 @@ def parse_chapters(name, value): # Cookies from browser if opts.cookiesfrombrowser: + container = None mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser) if mobj is None: raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}') @@ -354,12 +355,15 @@ def parse_chapters(name, value): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser specified for cookies: "{browser_name}". ' f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}') + elif profile and browser_name == 'firefox': + if ':' in profile and not os.path.exists(profile): + profile, container = profile.split(':', 1) if keyring is not None: keyring = keyring.upper() if keyring not in SUPPORTED_KEYRINGS: raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". 
' f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') - opts.cookiesfrombrowser = (browser_name, profile, keyring) + opts.cookiesfrombrowser = (browser_name, profile, keyring, container) # MetadataParser def metadataparser_actions(f): diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 1a164bb31a..c5fb5ab68c 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -3,6 +3,7 @@ import http.cookiejar import json import os +import re import shutil import struct import subprocess @@ -24,7 +25,7 @@ sqlite3, ) from .minicurses import MultilinePrinter, QuietMultilinePrinter -from .utils import Popen, YoutubeDLCookieJar, error_to_str, expand_path +from .utils import Popen, YoutubeDLCookieJar, error_to_str, expand_path, try_call CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -85,8 +86,9 @@ def _create_progress_bar(logger): def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile, keyring = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) if cookie_file is not None: is_filename = YoutubeDLCookieJar.is_path(cookie_file) @@ -101,9 +103,9 @@ def load_cookies(cookie_file, browser_specification, ydl): return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): if browser_name == 'firefox': - return _extract_firefox_cookies(profile, logger) + return _extract_firefox_cookies(profile, container, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: @@ -112,7 +114,7 @@ def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), raise ValueError(f'unknown browser: {browser_name}') -def _extract_firefox_cookies(profile, logger): +def _extract_firefox_cookies(profile, container, logger): logger.info('Extracting cookies from firefox') if not sqlite3: logger.warning('Cannot extract cookies from firefox without sqlite3 support. 
' @@ -126,6 +128,20 @@ def _extract_firefox_cookies(profile, logger): else: search_root = os.path.join(_firefox_browser_dir(), profile) + container_id = None + if container is not None: + containers_path = os.path.join(search_root, 'containers.json') + if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): + raise FileNotFoundError(f'could not read containers.json in {search_root}') + with open(containers_path, 'r') as containers: + identities = json.load(containers).get('identities', []) + container_id = next((context.get('userContextId') for context in identities if container in ( + context.get('name'), + try_call(lambda: re.fullmatch(r'userContext([^\.]+)\.label', context['l10nID']).group()) + )), None) + if not isinstance(container_id, int): + raise ValueError(f'could not find firefox container "{container}" in containers.json') + cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) if cookie_database_path is None: raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') @@ -135,7 +151,18 @@ def _extract_firefox_cookies(profile, logger): cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) - cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') + origin_attributes = '' + if isinstance(container_id, int): + origin_attributes = f'^userContextId={container_id}' + logger.debug( + f'Only loading cookies from firefox container "{container}", ID {container_id}') + try: + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes=?', + (origin_attributes, )) + except sqlite3.OperationalError: + logger.debug('Database exception, loading all cookies') + cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') jar = YoutubeDLCookieJar() with _create_progress_bar(logger) as progress_bar: table = cursor.fetchall() @@ -948,11 +975,11 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None, keyring=None): +def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') if keyring not in (None, *SUPPORTED_KEYRINGS): raise ValueError(f'unsupported keyring: "{keyring}"') if profile is not None and _is_path(profile): profile = os.path.expanduser(profile) - return browser_name, profile, keyring + return browser_name, profile, keyring, container diff --git a/yt_dlp/options.py b/yt_dlp/options.py index e667384481..e50ecc579c 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1400,12 +1400,12 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not read/dump cookies from/to file (default)') filesystem.add_option( '--cookies-from-browser', - dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE]', + dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE[:CONTAINER]]', help=( - 'The name of the browser and (optionally) the name/path of ' - 'the profile to load cookies from, separated by a ":". ' + 'The name of the browser and (optionally) the name/path of the profile to load cookies from ' + '(and container name if Firefox) separated by a ":". ' f'Currently supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}. ' - 'By default, the most recently accessed profile is used. 
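The container lookup above can also be exercised by calling the cookie API directly; a minimal sketch, with the profile and container names purely illustrative:

    from yt_dlp.cookies import extract_cookies_from_browser

    # only cookies whose originAttributes match the container are loaded,
    # per the moz_cookies query shown in the hunk above
    jar = extract_cookies_from_browser('firefox', profile='default', container='Personal')
    print(f'{len(jar)} cookies loaded')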
' + 'By default, the default container of the most recently accessed profile is used. ' 'The keyring used for decrypting Chromium cookies on Linux can be ' '(optionally) specified after the browser name separated by a "+". ' f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}')) From bfbecd1174a9e2ee08117352c26e664d36f1cc17 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Wed, 31 Aug 2022 02:07:55 +0900 Subject: [PATCH 103/284] [extractor/newspicks] Add extractor (#4725) Authored by: Lesmiscore --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/common.py | 4 +-- yt_dlp/extractor/newspicks.py | 54 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/extractor/newspicks.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 60e1b716f1..1cded3ddf8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1083,6 +1083,7 @@ NewgroundsPlaylistIE, NewgroundsUserIE, ) +from .newspicks import NewsPicksIE from .newstube import NewstubeIE from .newsy import NewsyIE from .nextmedia import ( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f950d28ed3..b792219553 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3260,7 +3260,7 @@ def _media_formats(src, cur_media_type, type_info=None): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) + src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source'))) if src: f = parse_content_type(media_attributes.get('type')) _, formats = _media_formats(src, media_type, f) @@ -3271,7 +3271,7 @@ def _media_formats(src, cur_media_type, type_info=None): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source'))) if not src: continue f = parse_content_type(s_attr.get('type')) diff --git a/yt_dlp/extractor/newspicks.py b/yt_dlp/extractor/newspicks.py new file mode 100644 index 0000000000..0232d53570 --- /dev/null +++ b/yt_dlp/extractor/newspicks.py @@ -0,0 +1,54 @@ +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class NewsPicksIE(InfoExtractor): + _VALID_URL = r'https://newspicks.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://newspicks.com/movie-series/11?movieId=1813', + 'info_dict': { + 'id': '1813', + 'title': '日本の課題を破壊せよ【ゲスト:成田悠輔】', + 'description': 'md5:09397aad46d6ded6487ff13f138acadf', + 'channel': 'HORIE ONE', + 'channel_id': '11', + 'release_date': '20220117', + 'thumbnail': r're:https://.+jpg', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id, channel_id = self._match_valid_url(url).group('id', 'channel_id') + webpage = self._download_webpage(url, video_id) + entries = self._parse_html5_media_entries( + url, webpage.replace('movie-for-pc', 'movie'), video_id, 'hls') + if not entries: + raise ExtractorError('No HTML5 media elements found') + info = entries[0] + self._sort_formats(info['formats']) + + title = self._html_search_meta('og:title', webpage, fatal=False) + description = self._html_search_meta( + ('og:description', 'twitter:title'), webpage, fatal=False) + channel = 
self._html_search_regex(
+            r'value="11".+?<div\s+class="title">(.+?)</div', webpage, 'channel name', fatal=False)
+        if not title or not channel:
+            title, channel = re.split(r'\s*\|\s*', self._html_extract_title(webpage))
+
+        release_date = self._search_regex(
+            r'<span\s+class="on-air-date">\s*(\d+)年(\d+)月(\d+)日\s*</span>',
+            webpage, 'release date', fatal=False, group=(1, 2, 3))
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'channel': channel,
+            'channel_id': channel_id,
+            'release_date': ('%04d%02d%02d' % tuple(map(int, release_date))) if release_date else None,
+        })
+        return info

From f26af78a8ac11d9d617ed31ea5282cfaa5bcbcfa Mon Sep 17 00:00:00 2001
From: Elyse <26639800+elyse0@users.noreply.github.com>
Date: Fri, 19 Aug 2022 00:30:04 -0500
Subject: [PATCH 104/284] [jsinterp] Add `charcodeAt` and bitwise overflow
 (#4706)

Authored by: elyse0
---
 test/test_jsinterp.py | 16 ++++++++++++++++
 yt_dlp/jsinterp.py    | 14 +++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py
index 778607fb25..4b6e22bac2 100644
--- a/test/test_jsinterp.py
+++ b/test/test_jsinterp.py
@@ -352,6 +352,22 @@ def test_regex(self):
         ''')
         self.assertEqual(jsi.call_function('x').flags & re.I, re.I)
 
+    def test_char_code_at(self):
+        jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}')
+        self.assertEqual(jsi.call_function('x', 0), 116)
+        self.assertEqual(jsi.call_function('x', 1), 101)
+        self.assertEqual(jsi.call_function('x', 2), 115)
+        self.assertEqual(jsi.call_function('x', 3), 116)
+        self.assertEqual(jsi.call_function('x', 4), None)
+        self.assertEqual(jsi.call_function('x', 'not_a_number'), 116)
+
+    def test_bitwise_operators_overflow(self):
+        jsi = JSInterpreter('function x(){return -524999584 << 5}')
+        self.assertEqual(jsi.call_function('x'), 379882496)
+
+        jsi = JSInterpreter('function x(){return 1236566549 << 5}')
+        self.assertEqual(jsi.call_function('x'), 915423904)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py
index 99bdca9270..51c7beed43 100644
--- a/yt_dlp/jsinterp.py
+++ b/yt_dlp/jsinterp.py
@@ -18,10 +18,11 @@
 
 
 def _js_bit_op(op):
+    def zeroise(x):
+        return 0 if x in (None, JS_Undefined) else x
+
     def wrapped(a, b):
-        def zeroise(x):
-            return 0 if x in (None, JS_Undefined) else x
-        return op(zeroise(a), zeroise(b))
+        return op(zeroise(a), zeroise(b)) & 0xffffffff
 
     return wrapped
 
@@ -692,6 +693,13 @@ def eval_method():
                     return obj.index(idx, start)
                 except ValueError:
                     return -1
+            elif member == 'charCodeAt':
+                assertion(isinstance(obj, str), 'must be applied on a string')
+                assertion(len(argvals) == 1, 'takes exactly one argument')
+                idx = argvals[0] if isinstance(argvals[0], int) else 0
+                if idx >= len(obj):
+                    return None
+                return ord(obj[idx])
 
             idx = int(member) if isinstance(obj, list) else member
             return obj[idx](argvals, allow_recursion=allow_recursion)

From 76f2bb175d56a8d85001da2b4ee18d790e0948ad Mon Sep 17 00:00:00 2001
From: DepFA <35278260+dfaker@users.noreply.github.com>
Date: Wed, 31 Aug 2022 16:40:59 +0100
Subject: [PATCH 105/284] [extractor/stripchat] Don't modify input URL (#4781)

Authored by: dfaker
---
 yt_dlp/extractor/stripchat.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/stripchat.py b/yt_dlp/extractor/stripchat.py
index 7214184bfc..2e84729bd2 100644
--- a/yt_dlp/extractor/stripchat.py
+++ b/yt_dlp/extractor/stripchat.py
@@ -29,9 +29,7 @@ class StripchatIE(InfoExtractor):
    def
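On the `& 0xffffffff` mask added to `_js_bit_op` in PATCH 104 above: JavaScript bitwise operators work on 32-bit integers while Python integers never overflow, so the mask truncates the result to 32 bits. The new test values can be checked by hand:

    # the same numbers as in test_bitwise_operators_overflow
    print((-524999584 << 5) & 0xffffffff)   # 379882496
    print((1236566549 << 5) & 0xffffffff)   # 915423904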
_real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://stripchat.com/%s/' % video_id, video_id, - headers=self.geo_verification_headers()) + webpage = self._download_webpage(url, video_id, headers=self.geo_verification_headers()) data = self._parse_json( self._search_regex( From f8c7ba99845c6d426d32e7f1218a6ecfc8132f45 Mon Sep 17 00:00:00 2001 From: Tejas Arlimatti <tejasarlimatti@gmail.com> Date: Wed, 31 Aug 2022 22:16:26 +0530 Subject: [PATCH 106/284] [extractor/epoch] Add extractor (#4772) Closes #4714 Authored by: tejasa97 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/epoch.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 yt_dlp/extractor/epoch.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1cded3ddf8..57abb345ac 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -470,6 +470,7 @@ EpiconIE, EpiconSeriesIE, ) +from .epoch import EpochIE from .eporner import EpornerIE from .eroprofile import ( EroProfileIE, diff --git a/yt_dlp/extractor/epoch.py b/yt_dlp/extractor/epoch.py new file mode 100644 index 0000000000..13eeabe3e4 --- /dev/null +++ b/yt_dlp/extractor/epoch.py @@ -0,0 +1,46 @@ +from .common import InfoExtractor + + +class EpochIE(InfoExtractor): + _VALID_URL = r'https?://www.theepochtimes\.com/[\w-]+_(?P<id>\d+).html' + _TESTS = [ + { + 'url': 'https://www.theepochtimes.com/they-can-do-audio-video-physical-surveillance-on-you-24h-365d-a-year-rex-lee-on-intrusive-apps_4661688.html', + 'info_dict': { + 'id': 'a3dd732c-4750-4bc8-8156-69180668bda1', + 'ext': 'mp4', + 'title': '‘They Can Do Audio, Video, Physical Surveillance on You 24H/365D a Year’: Rex Lee on Intrusive Apps', + } + }, + { + 'url': 'https://www.theepochtimes.com/the-communist-partys-cyberattacks-on-america-explained-rex-lee-talks-tech-hybrid-warfare_4342413.html', + 'info_dict': { + 'id': '276c7f46-3bbf-475d-9934-b9bbe827cf0a', + 'ext': 'mp4', + 'title': 'The Communist Party’s Cyberattacks on America Explained; Rex Lee Talks Tech Hybrid Warfare', + } + }, + { + 'url': 'https://www.theepochtimes.com/kash-patel-a-6-year-saga-of-government-corruption-from-russiagate-to-mar-a-lago_4690250.html', + 'info_dict': { + 'id': 'aa9ceecd-a127-453d-a2de-7153d6fd69b6', + 'ext': 'mp4', + 'title': 'Kash Patel: A ‘6-Year-Saga’ of Government Corruption, From Russiagate to Mar-a-Lago', + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + youmaker_video_id = self._search_regex(r'data-trailer="[\w-]+" data-id="([\w-]+)"', webpage, 'url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'http://vs1.youmaker.com/assets/{youmaker_video_id}/playlist.m3u8', video_id, 'mp4', m3u8_id='hls') + + return { + 'id': youmaker_video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': self._html_extract_title(webpage) + } From b86ca447ce0dc7b41e5314a7bb566cfa4d5a3660 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 31 Aug 2022 22:24:31 +0530 Subject: [PATCH 107/284] [extractor/mediaset] Fix embed extraction Closes #4804 --- yt_dlp/extractor/mediaset.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 0671c29a66..ebe894f740 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -172,31 +172,27 @@ class 
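The Epoch extractor in PATCH 106 above resolves a YouMaker asset id from the page markup and builds the HLS playlist URL from it. Roughly, with the sample HTML fabricated to match the regex and the URL template taken from the diff:

    import re

    html = '<div data-trailer="trailer-1" data-id="aa9ceecd-a127-453d-a2de-7153d6fd69b6">'
    youmaker_id = re.search(r'data-trailer="[\w-]+" data-id="([\w-]+)"', html).group(1)
    print(f'http://vs1.youmaker.com/assets/{youmaker_id}/playlist.m3u8')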
MediasetIE(ThePlatformBaseIE): }] def _extract_from_webpage(self, url, webpage): - def _qs(url): - return parse_qs(url) - def _program_guid(qs): return qs.get('programGuid', [None])[0] - entries = [] for mobj in re.finditer( r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', webpage): embed_url = mobj.group('url') - embed_qs = _qs(embed_url) + embed_qs = parse_qs(embed_url) program_guid = _program_guid(embed_qs) if program_guid: - entries.append(embed_url) + yield self.url_result(embed_url) continue + video_id = embed_qs.get('id', [None])[0] if not video_id: continue urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect') embed_url = urlh.geturl() - program_guid = _program_guid(_qs(embed_url)) + program_guid = _program_guid(parse_qs(embed_url)) if program_guid: - entries.append(embed_url) - return entries + yield self.url_result(embed_url) def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): From 11734714c2166a26f0de0c02ff1a0e736d15210f Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Thu, 1 Sep 2022 02:02:33 +0900 Subject: [PATCH 108/284] [extractor/eurosport] Add extractor (#4613) Closes #2487 Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/eurosport.py | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 yt_dlp/extractor/eurosport.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 57abb345ac..4c033e5c0e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -492,6 +492,7 @@ from .esri import EsriVideoIE from .europa import EuropaIE from .europeantour import EuropeanTourIE +from .eurosport import EurosportIE from .euscreen import EUScreenIE from .expotv import ExpoTVIE from .expressen import ExpressenIE diff --git a/yt_dlp/extractor/eurosport.py b/yt_dlp/extractor/eurosport.py new file mode 100644 index 0000000000..5681499fb3 --- /dev/null +++ b/yt_dlp/extractor/eurosport.py @@ -0,0 +1,99 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class EurosportIE(InfoExtractor): + _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _TESTS = [{ + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', + 'info_dict': { + 'id': '2480939', + 'ext': 'mp4', + 'title': 'Highlights: Rafael Nadal brushes aside Caper Ruud to win record-extending 14th French Open title', + 'description': 'md5:b564db73ecfe4b14ebbd8e62a3692c76', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388285-69245968-2560-1440.png', + 'duration': 195.0, + 'display_id': 'vid1694147', + 'timestamp': 1654446698, + 'upload_date': '20220605', + } + }, { + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/watch-the-top-five-shots-from-men-s-final-as-rafael-nadal-beats-casper-ruud-to-seal-14th-french-open_vid1694283/video.shtml', + 'info_dict': { + 'id': '2481254', + 'ext': 'mp4', + 'title': 'md5:149dcc5dfb38ab7352acc008cc9fb071', + 'duration': 130.0, + 'thumbnail': 
'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388422-69248708-2560-1440.png', + 'description': 'md5:a0c8a7f6b285e48ae8ddbe7aa85cfee6', + 'display_id': 'vid1694283', + 'timestamp': 1654456090, + 'upload_date': '20220605', + } + }, { + # geo-fence but can bypassed by xff + 'url': 'https://www.eurosport.com/cycling/tour-de-france-femmes/2022/incredible-ride-marlen-reusser-storms-to-stage-4-win-at-tour-de-france-femmes_vid1722221/video.shtml', + 'info_dict': { + 'id': '2582552', + 'ext': 'mp4', + 'title': '‘Incredible ride!’ - Marlen Reusser storms to Stage 4 win at Tour de France Femmes', + 'duration': 188.0, + 'display_id': 'vid1722221', + 'timestamp': 1658936167, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/07/27/3423347-69852108-2560-1440.jpg', + 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', + 'upload_date': '20220727', + } + }] + + _TOKEN = None + + # actually defined in https://netsport.eurosport.io/?variables={"databaseId":<databaseId>,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 .. + # but this method require to get sha256 hash + _GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not complete list but it should work + + def _real_initialize(self): + if EurosportIE._TOKEN is None: + EurosportIE._TOKEN = self._download_json( + 'https://eu3-prod-direct.eurosport.com/token?realm=eurosport', None, + 'Trying to get token')['data']['attributes']['token'] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._download_json( + f'https://eu3-prod-direct.eurosport.com/playback/v2/videoPlaybackInfo/sourceSystemId/eurosport-{display_id}', + display_id, query={'usePreAuth': True}, headers={'Authorization': f'Bearer {EurosportIE._TOKEN}'})['data'] + + json_ld_data = self._search_json_ld(webpage, display_id) + + formats, subtitles = [], {} + for stream_type in json_data['attributes']['streaming']: + if stream_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4') + elif stream_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + elif stream_type == 'mss': + fmts, subs = self._extract_ism_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + self._sort_formats(formats) + + return { + 'id': json_data['id'], + 'title': json_ld_data.get('title') or self._og_search_title(webpage), + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': json_ld_data.get('thumbnails'), + 'description': (json_ld_data.get('description') + or self._html_search_meta(['og:description', 'description'], webpage)), + 'duration': json_ld_data.get('duration'), + 'timestamp': json_ld_data.get('timestamp'), + } From 9f9c85dda4953923d710ca9d24b2e433ec26e882 Mon Sep 17 00:00:00 2001 From: shirt <shirt@shirt.rip> Date: Wed, 31 Aug 2022 13:12:26 -0400 Subject: [PATCH 109/284] [Build] Update pyinstaller --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
efacecd3c9..45c5a43ccc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -194,7 +194,7 @@ jobs: - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python -m pip install --upgrade pip setuptools wheel py2exe - pip install "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.2-py3-none-any.whl" -r requirements.txt + pip install "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.3-py3-none-any.whl" -r requirements.txt - name: Prepare run: | @@ -230,7 +230,7 @@ jobs: - name: Install Requirements run: | python -m pip install --upgrade pip setuptools wheel - pip install "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-5.2-py3-none-any.whl" -r requirements.txt + pip install "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-5.3-py3-none-any.whl" -r requirements.txt - name: Prepare run: | From de49cdbe9d37a66b05bb73292cfba031847386dc Mon Sep 17 00:00:00 2001 From: Yifu Yu <root@jackyyf.com> Date: Thu, 1 Sep 2022 01:52:16 +0800 Subject: [PATCH 110/284] [extractor/bilibili] Extract `flac` with premium account (#4759) Authored by: jackyyf --- yt_dlp/extractor/bilibili.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 17c974d496..59f5791d1e 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -218,6 +218,9 @@ def _real_extract(self, url): durl = traverse_obj(video_info, ('dash', 'video')) audios = traverse_obj(video_info, ('dash', 'audio')) or [] + flac_audio = traverse_obj(video_info, ('dash', 'flac', 'audio')) + if flac_audio: + audios.append(flac_audio) entries = [] RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') From b2a4db425b02644353fdfbb9fe9df8c6ce7064ab Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 1 Sep 2022 02:12:34 -0500 Subject: [PATCH 111/284] [VQQ] Add extractors (#4706) Closes #1666 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 7 +- yt_dlp/extractor/tencent.py | 369 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/wetv.py | 208 ------------------ 3 files changed, 375 insertions(+), 209 deletions(-) create mode 100644 yt_dlp/extractor/tencent.py delete mode 100644 yt_dlp/extractor/wetv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4c033e5c0e..c49d2481c2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1735,6 +1735,12 @@ from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .tempo import TempoIE +from .tencent import ( + VQQSeriesIE, + VQQVideoIE, + WeTvEpisodeIE, + WeTvSeriesIE, +) from .tennistv import TennisTVIE from .tenplay import TenPlayIE from .testurl import TestURLIE @@ -2099,7 +2105,6 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE -from .wetv import WeTvEpisodeIE, WeTvSeriesIE from .wikimedia import WikimediaIE from .willow import WillowIE from .wimtv import WimTVIE diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py new file mode 100644 index 0000000000..c755407d3c --- /dev/null +++ b/yt_dlp/extractor/tencent.py @@ -0,0 +1,369 @@ +import functools +import random +import re +import string +import time + +from .common import InfoExtractor +from ..aes import aes_cbc_encrypt_bytes +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + js_to_json, + traverse_obj, + urljoin, +) + + +class TencentBaseIE(InfoExtractor): + """Subclasses 
must set _API_URL, _APP_VERSION, _PLATFORM, _HOST, _REFERER""" + + def _get_ckey(self, video_id, url, guid): + ua = self.get_param('http_headers')['User-Agent'] + + payload = (f'{video_id}|{int(time.time())}|mg3c3b04ba|{self._APP_VERSION}|{guid}|' + f'{self._PLATFORM}|{url[:48]}|{ua.lower()[:48]}||Mozilla|Netscape|Windows x86_64|00|') + + return aes_cbc_encrypt_bytes( + bytes(f'|{sum(map(ord, payload))}|{payload}', 'utf-8'), + b'Ok\xda\xa3\x9e/\x8c\xb0\x7f^r-\x9e\xde\xf3\x14', + b'\x01PJ\xf3V\xe6\x19\xcf.B\xbb\xa6\x8c?p\xf9', + padding_mode='whitespace').hex().upper() + + def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): + guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)]) + ckey = self._get_ckey(video_id, video_url, guid) + query = { + 'vid': video_id, + 'cid': series_id, + 'cKey': ckey, + 'encryptVer': '8.1', + 'spcaptiontype': '1' if subtitle_format == 'vtt' else '0', + 'sphls': '2' if video_format == 'hls' else '0', + 'dtype': '3' if video_format == 'hls' else '0', + 'defn': video_quality, + 'spsrt': '2', # Enable subtitles + 'sphttps': '1', # Enable HTTPS + 'otype': 'json', + 'spwm': '1', + # For SHD + 'host': self._HOST, + 'referer': self._REFERER, + 'ehost': video_url, + 'appVer': self._APP_VERSION, + 'platform': self._PLATFORM, + # For VQQ + 'guid': guid, + 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)), + } + + return self._search_json(r'QZOutputJson=', self._download_webpage( + self._API_URL, video_id, query=query), 'api_response', video_id) + + def _extract_video_formats_and_subtitles(self, api_response, video_id): + video_response = api_response['vl']['vi'][0] + video_width, video_height = video_response.get('vw'), video_response.get('vh') + + formats, subtitles = [], {} + for video_format in video_response['ul']['ui']: + if video_format.get('hls'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_format['url'] + video_format['hls']['pt'], video_id, 'mp4', fatal=False) + for f in fmts: + f.update({'width': video_width, 'height': video_height}) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': f'{video_format["url"]}{video_response["fn"]}?vkey={video_response["fvkey"]}', + 'width': video_width, + 'height': video_height, + 'ext': 'mp4', + }) + + return formats, subtitles + + def _extract_video_native_subtitles(self, api_response, subtitles_format): + subtitles = {} + for subtitle in traverse_obj(api_response, ('sfl', 'fi')) or (): + subtitles.setdefault(subtitle['lang'].lower(), []).append({ + 'url': subtitle['url'], + 'ext': subtitles_format, + 'protocol': 'm3u8_native' if determine_ext(subtitle['url']) == 'm3u8' else 'http', + }) + + return subtitles + + def _extract_all_video_formats_and_subtitles(self, url, video_id, series_id): + formats, subtitles = [], {} + for video_format, subtitle_format, video_quality in ( + # '': 480p, 'shd': 720p, 'fhd': 1080p + ('mp4', 'srt', ''), ('hls', 'vtt', 'shd'), ('hls', 'vtt', 'fhd')): + api_response = self._get_video_api_response( + url, video_id, series_id, subtitle_format, video_format, video_quality) + + if api_response.get('em') != 0 and api_response.get('exem') != 0: + if '您所在区域暂无此内容版权' in api_response.get('msg'): + self.raise_geo_restricted() + raise ExtractorError(f'Tencent said: {api_response.get("msg")}') + + fmts, subs = self._extract_video_formats_and_subtitles(api_response, video_id) + native_subtitles = 
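Taken out of the class, `_get_ckey` above boils down to a checksum-prefixed payload encrypted with a fixed key/IV. A sketch with placeholder field values; the key, IV, payload layout and `padding_mode` are verbatim from the diff:

    from yt_dlp.aes import aes_cbc_encrypt_bytes

    payload = 'vid123|1660000000|mg3c3b04ba|3.5.57|0123456789abcdef|10901|url|ua||Mozilla|Netscape|Windows x86_64|00|'
    ckey = aes_cbc_encrypt_bytes(
        f'|{sum(map(ord, payload))}|{payload}'.encode(),  # ord-sum checksum prefix
        b'Ok\xda\xa3\x9e/\x8c\xb0\x7f^r-\x9e\xde\xf3\x14',  # fixed key from the diff
        b'\x01PJ\xf3V\xe6\x19\xcf.B\xbb\xa6\x8c?p\xf9',     # fixed IV from the diff
        padding_mode='whitespace').hex().upper()
    print(ckey[:32])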
self._extract_video_native_subtitles(api_response, subtitle_format) + + formats.extend(fmts) + self._merge_subtitles(subs, native_subtitles, target=subtitles) + + self._sort_formats(formats) + return formats, subtitles + + def _get_clean_title(self, title): + return re.sub( + r'\s*[_\-]\s*(?:Watch online|腾讯视频|(?:高清)?1080P在线观看平台).*?$', + '', title or '').strip() or None + + +class VQQBaseIE(TencentBaseIE): + _VALID_URL_BASE = r'https?://v\.qq\.com' + + _API_URL = 'https://h5vv6.video.qq.com/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '10901' + _HOST = 'v.qq.com' + _REFERER = 'v.qq.com' + + def _get_webpage_metadata(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>', + webpage, 'pinia data', fatal=False), + video_id, transform_source=js_to_json, fatal=False) + + +class VQQVideoIE(VQQBaseIE): + IE_NAME = 'vqq:video' + _VALID_URL = VQQBaseIE._VALID_URL_BASE + r'/x/(?:page|cover/(?P<series_id>\w+))/(?P<id>\w+)' + + _TESTS = [{ + 'url': 'https://v.qq.com/x/page/q326831cny0.html', + 'md5': '826ef93682df09e3deac4a6e6e8cdb6e', + 'info_dict': { + 'id': 'q326831cny0', + 'ext': 'mp4', + 'title': '我是选手:雷霆裂阵,终极时刻', + 'description': 'md5:e7ed70be89244017dac2a835a10aeb1e', + 'thumbnail': r're:^https?://[^?#]+q326831cny0', + }, + }, { + 'url': 'https://v.qq.com/x/page/o3013za7cse.html', + 'md5': 'b91cbbeada22ef8cc4b06df53e36fa21', + 'info_dict': { + 'id': 'o3013za7cse', + 'ext': 'mp4', + 'title': '欧阳娜娜VLOG', + 'description': 'md5:29fe847497a98e04a8c3826e499edd2e', + 'thumbnail': r're:^https?://[^?#]+o3013za7cse', + }, + }, { + 'url': 'https://v.qq.com/x/cover/7ce5noezvafma27/a00269ix3l8.html', + 'md5': '71459c5375c617c265a22f083facce67', + 'info_dict': { + 'id': 'a00269ix3l8', + 'ext': 'mp4', + 'title': '鸡毛飞上天 第01集', + 'description': 'md5:8cae3534327315b3872fbef5e51b5c5b', + 'thumbnail': r're:^https?://[^?#]+7ce5noezvafma27', + 'series': '鸡毛飞上天', + }, + }, { + 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', + 'md5': '96b9fd4a189fdd4078c111f21d7ac1bc', + 'info_dict': { + 'id': 's0043cwsgj0', + 'ext': 'mp4', + 'title': '第1集:如何快乐吃糖?', + 'description': 'md5:1d8c3a0b8729ae3827fa5b2d3ebd5213', + 'thumbnail': r're:^https?://[^?#]+s0043cwsgj0', + 'series': '青年理工工作者生活研究所', + }, + }] + + def _real_extract(self, url): + video_id, series_id = self._match_valid_url(url).group('id', 'series_id') + webpage = self._download_webpage(url, video_id) + webpage_metadata = self._get_webpage_metadata(webpage, video_id) + + formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) + return { + 'id': video_id, + 'title': self._get_clean_title(self._og_search_title(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'title'))), + 'description': (self._og_search_description(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'desc'))), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': (self._og_search_thumbnail(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'pic160x90'))), + 'series': traverse_obj(webpage_metadata, ('global', 'coverInfo', 'title')), + } + + +class VQQSeriesIE(VQQBaseIE): + IE_NAME = 'vqq:series' + _VALID_URL = VQQBaseIE._VALID_URL_BASE + r'/x/cover/(?P<id>\w+)\.html/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://v.qq.com/x/cover/7ce5noezvafma27.html', + 'info_dict': { + 'id': '7ce5noezvafma27', + 'title': '鸡毛飞上天', + 'description': 'md5:8cae3534327315b3872fbef5e51b5c5b', + }, + 'playlist_count': 55, + 
}, { + 'url': 'https://v.qq.com/x/cover/oshd7r0vy9sfq8e.html', + 'info_dict': { + 'id': 'oshd7r0vy9sfq8e', + 'title': '恋爱细胞2', + 'description': 'md5:9d8a2245679f71ca828534b0f95d2a03', + }, + 'playlist_count': 12, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage(url, series_id) + webpage_metadata = self._get_webpage_metadata(webpage, series_id) + + episode_paths = [f'/x/cover/{series_id}/{video_id}.html' for video_id in re.findall( + r'<div[^>]+data-vid="(?P<video_id>[^"]+)"[^>]+class="[^"]+episode-item-rect--number', + webpage)] + + return self.playlist_from_matches( + episode_paths, series_id, ie=VQQVideoIE, getter=functools.partial(urljoin, url), + title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) + or self._og_search_title(webpage)), + description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage))) + + +class WeTvBaseIE(TencentBaseIE): + _VALID_URL_BASE = r'https?://(?:www\.)?wetv\.vip/(?:[^?#]+/)?play' + + _API_URL = 'https://play.wetv.vip/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '4830201' + _HOST = 'wetv.vip' + _REFERER = 'wetv.vip' + + def _get_webpage_metadata(self, webpage, video_id): + return self._parse_json( + traverse_obj(self._search_nextjs_data(webpage, video_id), ('props', 'pageProps', 'data')), + video_id, fatal=False) + + +class WeTvEpisodeIE(WeTvBaseIE): + IE_NAME = 'wetv:episode' + _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<series_id>\w+)(?:-[^?#]+)?/(?P<id>\w+)(?:-[^?#]+)?' + + _TESTS = [{ + 'url': 'https://wetv.vip/en/play/air11ooo2rdsdi3-Cute-Programmer/v0040pr89t9-EP1-Cute-Programmer', + 'md5': '0c70fdfaa5011ab022eebc598e64bbbe', + 'info_dict': { + 'id': 'v0040pr89t9', + 'ext': 'mp4', + 'title': 'EP1: Cute Programmer', + 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', + 'thumbnail': r're:^https?://[^?#]+air11ooo2rdsdi3', + 'series': 'Cute Programmer', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2835, + }, + }, { + 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu/p0039b9nvik', + 'md5': '3b3c15ca4b9a158d8d28d5aa9d7c0a49', + 'info_dict': { + 'id': 'p0039b9nvik', + 'ext': 'mp4', + 'title': 'EP1: You Are My Glory', + 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', + 'thumbnail': r're:^https?://[^?#]+u37kgfnfzs73kiu', + 'series': 'You Are My Glory', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2454, + }, + }, { + 'url': 'https://wetv.vip/en/play/lcxgwod5hapghvw-WeTV-PICK-A-BOO/i0042y00lxp-Zhao-Lusi-Describes-The-First-Experiences-She-Had-In-Who-Rules-The-World-%7C-WeTV-PICK-A-BOO', + 'md5': '71133f5c2d5d6cad3427e1b010488280', + 'info_dict': { + 'id': 'i0042y00lxp', + 'ext': 'mp4', + 'title': 'md5:f7a0857dbe5fbbe2e7ad630b92b54e6a', + 'description': 'md5:76260cb9cdc0ef76826d7ca9d92fadfa', + 'thumbnail': r're:^https?://[^?#]+lcxgwod5hapghvw', + 'series': 'WeTV PICK-A-BOO', + 'episode': 'Episode 0', + 'episode_number': 0, + 'duration': 442, + }, + }] + + def _real_extract(self, url): + video_id, series_id = self._match_valid_url(url).group('id', 'series_id') + webpage = self._download_webpage(url, video_id) + webpage_metadata = self._get_webpage_metadata(webpage, video_id) + + formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) + return { + 'id': video_id, + 'title': self._get_clean_title(self._og_search_title(webpage) + or traverse_obj(webpage_metadata, ('coverInfo', 'title'))), + 'description': (traverse_obj(webpage_metadata, 
('coverInfo', 'description')) + or self._og_search_description(webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'duration'))), + 'series': traverse_obj(webpage_metadata, ('coverInfo', 'title')), + 'episode_number': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'episode'))), + } + + +class WeTvSeriesIE(WeTvBaseIE): + _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<id>\w+)(?:-[^/?#]+)?/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://wetv.vip/play/air11ooo2rdsdi3-Cute-Programmer', + 'info_dict': { + 'id': 'air11ooo2rdsdi3', + 'title': 'Cute Programmer', + 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', + }, + 'playlist_count': 30, + }, { + 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu-You-Are-My-Glory', + 'info_dict': { + 'id': 'u37kgfnfzs73kiu', + 'title': 'You Are My Glory', + 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', + }, + 'playlist_count': 32, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage(url, series_id) + webpage_metadata = self._get_webpage_metadata(webpage, series_id) + + episode_paths = ([f'/play/{series_id}/{episode["vid"]}' for episode in webpage_metadata.get('videoList')] + or re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage)) + + return self.playlist_from_matches( + episode_paths, series_id, ie=WeTvEpisodeIE, getter=functools.partial(urljoin, url), + title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) + or self._og_search_title(webpage)), + description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage))) diff --git a/yt_dlp/extractor/wetv.py b/yt_dlp/extractor/wetv.py deleted file mode 100644 index ea2d0517ed..0000000000 --- a/yt_dlp/extractor/wetv.py +++ /dev/null @@ -1,208 +0,0 @@ -import functools -import re -import time - -from .common import InfoExtractor -from ..aes import aes_cbc_encrypt_bytes -from ..utils import determine_ext, int_or_none, traverse_obj, urljoin - - -class WeTvBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?wetv\.vip/(?:[^?#]+/)?play' - - def _get_ckey(self, video_id, url, app_version, platform): - ua = self.get_param('http_headers')['User-Agent'] - - payload = (f'{video_id}|{int(time.time())}|mg3c3b04ba|{app_version}|0000000000000000|' - f'{platform}|{url[:48]}|{ua.lower()[:48]}||Mozilla|Netscape|Win32|00|') - - return aes_cbc_encrypt_bytes( - bytes(f'|{sum(map(ord, payload))}|{payload}', 'utf-8'), - b'Ok\xda\xa3\x9e/\x8c\xb0\x7f^r-\x9e\xde\xf3\x14', - b'\x01PJ\xf3V\xe6\x19\xcf.B\xbb\xa6\x8c?p\xf9', - padding_mode='whitespace').hex() - - def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): - app_version = '3.5.57' - platform = '4830201' - - ckey = self._get_ckey(video_id, video_url, app_version, platform) - query = { - 'vid': video_id, - 'cid': series_id, - 'cKey': ckey, - 'encryptVer': '8.1', - 'spcaptiontype': '1' if subtitle_format == 'vtt' else '0', # 0 - SRT, 1 - VTT - 'sphls': '1' if video_format == 'hls' else '0', # 0 - MP4, 1 - HLS - 'defn': video_quality, # '': 480p, 'shd': 720p, 'fhd': 1080p - 'spsrt': '1', # Enable subtitles - 'sphttps': '1', # Enable HTTPS - 'otype': 'json', # Response format: xml, json, - 'dtype': '1', - 'spwm': '1', - 'host': 'wetv.vip', # These three values are needed for SHD - 'referer': 'wetv.vip', - 'ehost': 
video_url, - 'appVer': app_version, - 'platform': platform, - } - - return self._search_json(r'QZOutputJson=', self._download_webpage( - 'https://play.wetv.vip/getvinfo', video_id, query=query), 'api_response', video_id) - - def _get_webpage_metadata(self, webpage, video_id): - return self._parse_json( - traverse_obj(self._search_nextjs_data(webpage, video_id), ('props', 'pageProps', 'data')), - video_id, fatal=False) - - -class WeTvEpisodeIE(WeTvBaseIE): - IE_NAME = 'wetv:episode' - _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<series_id>\w+)(?:-[^?#]+)?/(?P<id>\w+)(?:-[^?#]+)?' - - _TESTS = [{ - 'url': 'https://wetv.vip/en/play/air11ooo2rdsdi3-Cute-Programmer/v0040pr89t9-EP1-Cute-Programmer', - 'md5': 'a046f565c9dce9b263a0465a422cd7bf', - 'info_dict': { - 'id': 'v0040pr89t9', - 'ext': 'mp4', - 'title': 'EP1: Cute Programmer', - 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', - 'thumbnail': r're:^https?://[^?#]+air11ooo2rdsdi3', - 'series': 'Cute Programmer', - 'episode': 'Episode 1', - 'episode_number': 1, - 'duration': 2835, - }, - }, { - 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu/p0039b9nvik', - 'md5': '4d9d69bcfd11da61f4aae64fc6b316b3', - 'info_dict': { - 'id': 'p0039b9nvik', - 'ext': 'mp4', - 'title': 'EP1: You Are My Glory', - 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', - 'thumbnail': r're:^https?://[^?#]+u37kgfnfzs73kiu', - 'series': 'You Are My Glory', - 'episode': 'Episode 1', - 'episode_number': 1, - 'duration': 2454, - }, - }, { - 'url': 'https://wetv.vip/en/play/lcxgwod5hapghvw-WeTV-PICK-A-BOO/i0042y00lxp-Zhao-Lusi-Describes-The-First-Experiences-She-Had-In-Who-Rules-The-World-%7C-WeTV-PICK-A-BOO', - 'md5': '71133f5c2d5d6cad3427e1b010488280', - 'info_dict': { - 'id': 'i0042y00lxp', - 'ext': 'mp4', - 'title': 'md5:f7a0857dbe5fbbe2e7ad630b92b54e6a', - 'description': 'md5:76260cb9cdc0ef76826d7ca9d92fadfa', - 'thumbnail': r're:^https?://[^?#]+lcxgwod5hapghvw', - 'series': 'WeTV PICK-A-BOO', - 'episode': 'Episode 0', - 'episode_number': 0, - 'duration': 442, - }, - }] - - def _extract_video_formats_and_subtitles(self, api_response, video_id, video_quality): - video_response = api_response['vl']['vi'][0] - video_width = video_response.get('vw') - video_height = video_response.get('vh') - - formats, subtitles = [], {} - for video_format in video_response['ul']['ui']: - if video_format.get('hls'): - fmts, subs = self._extract_m3u8_formats_and_subtitles( - video_format['url'] + video_format['hls']['pname'], video_id, 'mp4', fatal=False) - for f in fmts: - f['width'] = video_width - f['height'] = video_height - - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - else: - formats.append({ - 'url': f'{video_format["url"]}{video_response["fn"]}?vkey={video_response["fvkey"]}', - 'width': video_width, - 'height': video_height, - 'ext': 'mp4', - }) - - return formats, subtitles - - def _extract_video_subtitles(self, api_response, subtitles_format): - subtitles = {} - for subtitle in traverse_obj(api_response, ('sfl', 'fi')): - subtitles.setdefault(subtitle['lang'].lower(), []).append({ - 'url': subtitle['url'], - 'ext': subtitles_format, - 'protocol': 'm3u8_native' if determine_ext(subtitle['url']) == 'm3u8' else 'http', - }) - - return subtitles - - def _real_extract(self, url): - video_id, series_id = self._match_valid_url(url).group('id', 'series_id') - webpage = self._download_webpage(url, video_id) - - formats, subtitles = [], {} - for video_format, subtitle_format, video_quality in (('mp4', 'srt', ''), ('hls', 'vtt', 'shd'), ('hls', 'vtt', 
'fhd')): - api_response = self._get_video_api_response(url, video_id, series_id, subtitle_format, video_format, video_quality) - - fmts, subs = self._extract_video_formats_and_subtitles(api_response, video_id, video_quality) - native_subtitles = self._extract_video_subtitles(api_response, subtitle_format) - - formats.extend(fmts) - self._merge_subtitles(subs, native_subtitles, target=subtitles) - - self._sort_formats(formats) - webpage_metadata = self._get_webpage_metadata(webpage, video_id) - - return { - 'id': video_id, - 'title': (self._og_search_title(webpage) - or traverse_obj(webpage_metadata, ('coverInfo', 'description'))), - 'description': (self._og_search_description(webpage) - or traverse_obj(webpage_metadata, ('coverInfo', 'description'))), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'duration'))), - 'series': traverse_obj(webpage_metadata, ('coverInfo', 'title')), - 'episode_number': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'episode'))), - } - - -class WeTvSeriesIE(WeTvBaseIE): - _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<id>\w+)(?:-[^/?#]+)?/?(?:[?#]|$)' - - _TESTS = [{ - 'url': 'https://wetv.vip/play/air11ooo2rdsdi3-Cute-Programmer', - 'info_dict': { - 'id': 'air11ooo2rdsdi3', - 'title': 'Cute Programmer', - 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', - }, - 'playlist_count': 30, - }, { - 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu-You-Are-My-Glory', - 'info_dict': { - 'id': 'u37kgfnfzs73kiu', - 'title': 'You Are My Glory', - 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', - }, - 'playlist_count': 32, - }] - - def _real_extract(self, url): - series_id = self._match_id(url) - webpage = self._download_webpage(url, series_id) - webpage_metadata = self._get_webpage_metadata(webpage, series_id) - - episode_paths = (re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage) - or [f'/{series_id}/{episode["vid"]}' for episode in webpage_metadata.get('videoList')]) - - return self.playlist_from_matches( - episode_paths, series_id, ie=WeTvEpisodeIE, getter=functools.partial(urljoin, url), - title=traverse_obj(webpage_metadata, ('coverInfo', 'title')) or self._og_search_title(webpage), - description=traverse_obj(webpage_metadata, ('coverInfo', 'description')) or self._og_search_description(webpage)) From 92aa6d688358ab4f328d37e66f0db3c54d7ab89b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Sep 2022 09:50:54 +0000 Subject: [PATCH 112/284] [extractor/triller] Add extractor (#4712) Closes #4703 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/triller.py | 304 ++++++++++++++++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100644 yt_dlp/extractor/triller.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c49d2481c2..8368e9315b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1800,6 +1800,10 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE +from .triller import ( + TrillerIE, + TrillerUserIE, +) from .trilulilu import TriluliluIE from .trovo import ( TrovoIE, diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py new file mode 100644 index 0000000000..c199da91da --- /dev/null +++ b/yt_dlp/extractor/triller.py @@ -0,0 +1,304 @@ +import itertools 
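The VQQ/WeTV extractors migrated above can be smoke-tested through the public API; the URL and expected id come from the `_TESTS` entries, and network access is required:

    from yt_dlp import YoutubeDL

    with YoutubeDL() as ydl:
        info = ydl.extract_info('https://v.qq.com/x/page/q326831cny0.html', download=False)
        print(info['id'], info['title'])  # q326831cny0, 我是选手:雷霆裂阵,终极时刻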
+import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + url_basename, + ExtractorError, +) + + +class TrillerBaseIE(InfoExtractor): + _NETRC_MACHINE = 'triller' + _AUTH_TOKEN = None + _API_BASE_URL = 'https://social.triller.co/v1.5' + + def _perform_login(self, username, password): + if self._AUTH_TOKEN: + return + + user_check = self._download_json( + f'{self._API_BASE_URL}/api/user/is-valid-username', None, note='Checking username', + fatal=False, expected_status=400, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://triller.co', + }, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8')) + if user_check.get('status'): # endpoint returns "status":false if username exists + raise ExtractorError('Unable to login: Invalid username', expected=True) + + credentials = { + 'username': username, + 'password': password, + } + login = self._download_json( + f'{self._API_BASE_URL}/user/auth', None, note='Logging in', + fatal=False, expected_status=400, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://triller.co', + }, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8')) + if not login.get('auth_token'): + if login.get('error') == 1008: + raise ExtractorError('Unable to login: Incorrect password', expected=True) + raise ExtractorError('Unable to login') + + self._AUTH_TOKEN = login['auth_token'] + + def _get_comments(self, video_id, limit=15): + comment_info = self._download_json( + f'{self._API_BASE_URL}/api/videos/{video_id}/comments_v2', + video_id, fatal=False, note='Downloading comments API JSON', + headers={'Origin': 'https://triller.co'}, query={'limit': limit}) or {} + if not comment_info.get('comments'): + return + for comment_dict in comment_info['comments']: + yield { + 'author': traverse_obj(comment_dict, ('author', 'username')), + 'author_id': traverse_obj(comment_dict, ('author', 'user_id')), + 'id': comment_dict.get('id'), + 'text': comment_dict.get('body'), + 'timestamp': unified_timestamp(comment_dict.get('timestamp')), + } + + def _check_user_info(self, user_info): + if not user_info: + self.report_warning('Unable to extract user info') + elif user_info.get('private') and not user_info.get('followed_by_me'): + raise ExtractorError('This video is private', expected=True) + elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'): + raise ExtractorError('The author of the video is blocked', expected=True) + return user_info + + def _parse_video_info(self, video_info, username, user_info=None): + video_uuid = video_info.get('video_uuid') + video_id = video_info.get('id') + + formats = [] + video_url = traverse_obj(video_info, 'video_url', 'stream_url') + if video_url: + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'vcodec': 'h264', + 'width': video_info.get('width'), + 'height': video_info.get('height'), + 'format_id': url_basename(video_url).split('.')[0], + 'filesize': video_info.get('filesize'), + }) + video_set = video_info.get('video_set') or [] + for video in video_set: + resolution = video.get('resolution') or '' + formats.append({ + 'url': video['url'], + 'ext': 'mp4', + 'vcodec': video.get('codec'), + 'vbr': int_or_none(video.get('bitrate'), 1000), + 'width': int_or_none(resolution.split('x')[0]), + 'height': int_or_none(resolution.split('x')[1]), + 'format_id': url_basename(video['url']).split('.')[0], + }) + audio_url = video_info.get('audio_url') + if audio_url: + 
formats.append({ + 'url': audio_url, + 'ext': 'm4a', + 'format_id': url_basename(audio_url).split('.')[0], + }) + + manifest_url = video_info.get('transcoded_url') + if manifest_url: + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + comment_count = int_or_none(video_info.get('comment_count')) + + user_info = user_info or traverse_obj(video_info, 'user', default={}) + + return { + 'id': str_or_none(video_id) or video_uuid, + 'title': video_info.get('description') or f'Video by {username}', + 'thumbnail': video_info.get('thumbnail_url'), + 'description': video_info.get('description'), + 'uploader': str_or_none(username), + 'uploader_id': str_or_none(user_info.get('user_id')), + 'creator': str_or_none(user_info.get('name')), + 'timestamp': unified_timestamp(video_info.get('timestamp')), + 'upload_date': unified_strdate(video_info.get('timestamp')), + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('play_count')), + 'like_count': int_or_none(video_info.get('likes_count')), + 'artist': str_or_none(video_info.get('song_artist')), + 'track': str_or_none(video_info.get('song_title')), + 'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}', + 'uploader_url': f'https://triller.co/@{username}', + 'extractor_key': TrillerIE.ie_key(), + 'extractor': TrillerIE.IE_NAME, + 'formats': formats, + 'comment_count': comment_count, + '__post_extractor': self.extract_comments(video_id, comment_count), + } + + +class TrillerIE(TrillerBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?triller\.co/ + @(?P<username>[\w\._]+)/video/ + (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + ''' + _TESTS = [{ + 'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', + 'md5': '228662d783923b60d78395fedddc0a20', + 'info_dict': { + 'id': '71595734', + 'ext': 'mp4', + 'title': 'md5:9a2bf9435c5c4292678996a464669416', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:9a2bf9435c5c4292678996a464669416', + 'uploader': 'theestallion', + 'uploader_id': '18992236', + 'creator': 'Megan Thee Stallion', + 'timestamp': 1660598222, + 'upload_date': '20220815', + 'duration': 47, + 'height': 3840, + 'width': 2160, + 'view_count': int, + 'like_count': int, + 'artist': 'Megan Thee Stallion', + 'track': 'Her', + 'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', + 'uploader_url': 'https://triller.co/@theestallion', + 'comment_count': int, + } + }, { + 'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', + 'md5': '874055f462af5b0699b9dbb527a505a0', + 'info_dict': { + 'id': '71621339', + 'ext': 'mp4', + 'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'uploader': 'charlidamelio', + 'uploader_id': '1875551', + 'creator': 'charli damelio', + 'timestamp': 1660773354, + 'upload_date': '20220817', + 'duration': 16, + 'height': 1920, + 'width': 1080, + 'view_count': int, + 'like_count': int, + 'artist': 'Dixie', + 'track': 'Someone to Blame', + 'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', + 'uploader_url': 'https://triller.co/@charlidamelio', + 'comment_count': int, + } + }] + + def _real_extract(self, url): + username, video_uuid = 
self._match_valid_url(url).group('username', 'id') + + video_info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/api/videos/{video_uuid}', + video_uuid, note='Downloading video info API JSON', + errnote='Unable to download video info API JSON', + headers={ + 'Origin': 'https://triller.co', + }), ('videos', 0)) + if not video_info: + raise ExtractorError('No video info found in API response') + + user_info = self._check_user_info(video_info.get('user') or {}) + return self._parse_video_info(video_info, username, user_info) + + +class TrillerUserIE(TrillerBaseIE): + _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w\._]+)/?(?:$|[#?])' + _TESTS = [{ + # first videos request only returns 2 videos + 'url': 'https://triller.co/@theestallion', + 'playlist_mincount': 9, + 'info_dict': { + 'id': '18992236', + 'title': 'theestallion', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + } + }, { + 'url': 'https://triller.co/@charlidamelio', + 'playlist_mincount': 25, + 'info_dict': { + 'id': '1875551', + 'title': 'charlidamelio', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + } + }] + + def _real_initialize(self): + if not self._AUTH_TOKEN: + guest = self._download_json( + f'{self._API_BASE_URL}/user/create_guest', + None, note='Creating guest session', data=b'', headers={ + 'Origin': 'https://triller.co', + }, query={ + 'platform': 'Web', + 'app_version': '', + }) + if not guest.get('auth_token'): + raise ExtractorError('Unable to fetch required auth token for user extraction') + + self._AUTH_TOKEN = guest['auth_token'] + + def _extract_video_list(self, username, user_id, limit=6): + query = { + 'limit': limit, + } + for page in itertools.count(1): + for retry in self.RetryManager(): + try: + video_list = self._download_json( + f'{self._API_BASE_URL}/api/users/{user_id}/videos', + username, note=f'Downloading user video list page {page}', + errnote='Unable to download user video list', headers={ + 'Authorization': f'Bearer {self._AUTH_TOKEN}', + 'Origin': 'https://triller.co', + }, query=query) + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: + retry.error = e + continue + raise + if not video_list.get('videos'): + break + yield from video_list['videos'] + query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp')) + if not query['before_time']: + break + + def _entries(self, videos, username, user_info): + for video in videos: + yield self._parse_video_info(video, username, user_info) + + def _real_extract(self, url): + username = self._match_id(url) + user_info = self._check_user_info(self._download_json( + f'{self._API_BASE_URL}/api/users/by_username/{username}', + username, note='Downloading user info', + errnote='Failed to download user info', headers={ + 'Authorization': f'Bearer {self._AUTH_TOKEN}', + 'Origin': 'https://triller.co', + }).get('user', {})) + + user_id = str_or_none(user_info.get('user_id')) + videos = self._extract_video_list(username, user_id) + thumbnail = user_info.get('avatar_url') + + return self.playlist_result( + self._entries(videos, username, user_info), user_id, username, thumbnail=thumbnail) From 825d3ce386e66ac0c73e41e352d84053f9f0e624 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Sep 2022 09:52:59 +0000 Subject: [PATCH 113/284] [cookies] Improve container support (#4806) Closes #4800 Authored by: bashonly, pukkandan, coletdjnz --- README.md | 27 ++++++++++++++------------- yt_dlp/__init__.py 
| 14 ++++++++------ yt_dlp/cookies.py | 28 ++++++++++++++-------------- yt_dlp/options.py | 13 +++++++------ 4 files changed, 43 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index c101048d5a..8965089652 100644 --- a/README.md +++ b/README.md @@ -706,19 +706,20 @@ ## Filesystem Options: and dump cookie jar in --no-cookies Do not read/dump cookies from/to file (default) - --cookies-from-browser BROWSER[+KEYRING][:PROFILE[:CONTAINER]] - The name of the browser and (optionally) the - name/path of the profile to load cookies - from (and container name if Firefox) - separated by a ":". Currently supported - browsers are: brave, chrome, chromium, edge, - firefox, opera, safari, vivaldi. By default, - the default container of the most recently - accessed profile is used. The keyring used - for decrypting Chromium cookies on Linux can - be (optionally) specified after the browser - name separated by a "+". Currently supported - keyrings are: basictext, gnomekeyring, kwallet + --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] + The name of the browser to load cookies + from. Currently supported browsers are: + brave, chrome, chromium, edge, firefox, + opera, safari, vivaldi. Optionally, the + KEYRING used for decrypting Chromium cookies + on Linux, the name/path of the PROFILE to + load cookies from, and the CONTAINER name + (if Firefox) ("none" for no container) can + be given with their respective seperators. + By default, all containers of the most + recently accessed profile are used. + Currently supported keyrings are: basictext, + gnomekeyring, kwallet --no-cookies-from-browser Do not load cookies from browser (default) --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information (such diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f4a2086ce2..552f29bd96 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -347,23 +347,25 @@ def parse_chapters(name, value): # Cookies from browser if opts.cookiesfrombrowser: container = None - mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser) + mobj = re.fullmatch(r'''(?x) + (?P<name>[^+:]+) + (?:\s*\+\s*(?P<keyring>[^:]+))? + (?:\s*:\s*(?P<profile>.+?))? + (?:\s*::\s*(?P<container>.+))? + ''', opts.cookiesfrombrowser) if mobj is None: raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}') - browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile') + browser_name, keyring, profile, container = mobj.group('name', 'keyring', 'profile', 'container') browser_name = browser_name.lower() if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser specified for cookies: "{browser_name}". ' f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}') - elif profile and browser_name == 'firefox': - if ':' in profile and not os.path.exists(profile): - profile, container = profile.split(':', 1) if keyring is not None: keyring = keyring.upper() if keyring not in SUPPORTED_KEYRINGS: raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". 
' f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') - opts.cookiesfrombrowser = (browser_name, profile, keyring, container) + opts.cookiesfrombrowser = (browser_name, profile or None, keyring, container or None) # MetadataParser def metadataparser_actions(f): diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index c5fb5ab68c..9100f46ac2 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -128,9 +128,14 @@ def _extract_firefox_cookies(profile, container, logger): else: search_root = os.path.join(_firefox_browser_dir(), profile) + cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) + if cookie_database_path is None: + raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') + container_id = None - if container is not None: - containers_path = os.path.join(search_root, 'containers.json') + if container not in (None, 'none'): + containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): raise FileNotFoundError(f'could not read containers.json in {search_root}') with open(containers_path, 'r') as containers: @@ -142,26 +147,21 @@ def _extract_firefox_cookies(profile, container, logger): if not isinstance(container_id, int): raise ValueError(f'could not find firefox container "{container}" in containers.json') - cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) - if cookie_database_path is None: - raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') - logger.debug(f'Extracting cookies from: "{cookie_database_path}"') - with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) - origin_attributes = '' if isinstance(container_id, int): - origin_attributes = f'^userContextId={container_id}' logger.debug( f'Only loading cookies from firefox container "{container}", ID {container_id}') - try: cursor.execute( - 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes=?', - (origin_attributes, )) - except sqlite3.OperationalError: - logger.debug('Database exception, loading all cookies') + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes LIKE ? 
OR originAttributes LIKE ?', + (f'%userContextId={container_id}', f'%userContextId={container_id}&%')) + elif container == 'none': + logger.debug('Only loading cookies not belonging to any container') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE NOT INSTR(originAttributes,"userContextId=")') + else: cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') jar = YoutubeDLCookieJar() with _create_progress_bar(logger) as progress_bar: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index e50ecc579c..da6b1d25b0 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1400,14 +1400,15 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not read/dump cookies from/to file (default)') filesystem.add_option( '--cookies-from-browser', - dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE[:CONTAINER]]', + dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE][::CONTAINER]', help=( - 'The name of the browser and (optionally) the name/path of the profile to load cookies from ' - '(and container name if Firefox) separated by a ":". ' + 'The name of the browser to load cookies from. ' f'Currently supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}. ' - 'By default, the default container of the most recently accessed profile is used. ' - 'The keyring used for decrypting Chromium cookies on Linux can be ' - '(optionally) specified after the browser name separated by a "+". ' + 'Optionally, the KEYRING used for decrypting Chromium cookies on Linux, ' + 'the name/path of the PROFILE to load cookies from, ' + 'and the CONTAINER name (if Firefox) ("none" for no container) ' + 'can be given with their respective seperators. ' + 'By default, all containers of the most recently accessed profile are used. ' f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}')) filesystem.add_option( '--no-cookies-from-browser', From 1ff88b7aec76bc8396c58f4757e2c08b20e5533e Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Thu, 1 Sep 2022 10:02:28 +0000 Subject: [PATCH 114/284] [extractor/youtube] Add `no-youtube-prefer-utc-upload-date` compat option (#4771) This option reverts https://github.com/yt-dlp/yt-dlp/commit/992f9a730b49fd36fc422be8d802f98ebcdce418 and https://github.com/yt-dlp/yt-dlp/commit/17322130a954577bb03b833d5c435638e51e19f2 to prefer the non-UTC upload date in microformats. Authored by: coletdjnz, pukkandan --- README.md | 1 + yt_dlp/extractor/youtube.py | 36 +++++++++++++++++++++++++++++++++++- yt_dlp/options.py | 1 + 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8965089652..83ab309c6c 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ ### Differences in default behavior * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading * Youtube channel URLs are automatically redirected to `/video`. Append a `/featured` to the URL to download only the videos in the home page. If the channel does not have a videos tab, we try to download the equivalent `UU` playlist instead. For all other tabs, if the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. 
You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections * Unavailable videos are also listed for youtube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this +* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ee9cce16e7..b1eda0d07f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2159,6 +2159,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_follower_count': int } + }, { + # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date + 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4', + 'info_dict': { + 'id': '2NUZ8W2llS4', + 'ext': 'mp4', + 'title': 'The NP that test your phone performance 🙂', + 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d', + 'uploader': 'Leon Nguyen', + 'uploader_id': 'VNSXIII', + 'uploader_url': 'http://www.youtube.com/user/VNSXIII', + 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA', + 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA', + 'duration': 21, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Gaming'], + 'tags': 'count:23', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'upload_date': '20220102', + 'like_count': int, + 'availability': 'public', + 'channel': 'Leon Nguyen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'comment_count': int, + 'channel_follower_count': int + }, + 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM', @@ -3920,7 +3949,12 @@ def process_language(container, base_url, lang_code, sub_name, query): upload_date = ( unified_strdate(get_first(microformats, 'uploadDate')) or unified_strdate(search_meta('uploadDate'))) - if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'): + if not upload_date or ( + not info.get('is_live') + and not info.get('was_live') + and info.get('live_status') != 'is_upcoming' + and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) + ): upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') or upload_date info['upload_date'] = upload_date diff --git a/yt_dlp/options.py b/yt_dlp/options.py index da6b1d25b0..0fbf1f028b 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -444,6 +444,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 
'embed-metadata', 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', + 'no-youtube-prefer-utc-upload-date' }, 'aliases': { 'youtube-dl': ['all', '-multistreams'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], From 50a399326fa82e2e5fe3f2829da5a31407adafaa Mon Sep 17 00:00:00 2001 From: satan1st <satan1st@users.noreply.github.com> Date: Thu, 1 Sep 2022 13:16:17 +0200 Subject: [PATCH 115/284] [build] `make tar' should not follow `DESTDIR` (#4790) Ref: https://www.gnu.org/prep/standards/html_node/DESTDIR.html Authored by: satan1st --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d6a00d332b..6cb9e2f57e 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,6 @@ completion-zsh: completions/zsh/_yt-dlp lazy-extractors: yt_dlp/extractor/lazy_extractors.py PREFIX ?= /usr/local -DESTDIR ?= . BINDIR ?= $(PREFIX)/bin MANDIR ?= $(PREFIX)/man SHAREDIR ?= $(PREFIX)/share @@ -134,7 +133,7 @@ yt_dlp/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscrip $(PYTHON) devscripts/make_lazy_extractors.py $@ yt-dlp.tar.gz: all - @tar -czf $(DESTDIR)/yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ + @tar -czf yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ --exclude '*.pyc' \ From f2e9fa3ef7a7ce8e18cec53ea7956a3bb36c59ea Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 31 Aug 2022 22:49:14 +0530 Subject: [PATCH 116/284] [FormatSort] Fix `aext` for `--prefer-free-formats` Closes #4735 --- README.md | 2 +- yt_dlp/extractor/common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 83ab309c6c..176832ca99 100644 --- a/README.md +++ b/README.md @@ -1530,7 +1530,7 @@ ## Sorting Formats - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `eac3` > `ac3` > `dts` > other) - `codec`: Equivalent to `vcodec,acodec` - `vext`: Video Extension (`mp4` > `webm` > `flv` > other). If `--prefer-free-formats` is used, `webm` is preferred. - - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). If `--prefer-free-formats` is used, the order changes to `opus` > `ogg` > `webm` > `m4a` > `mp3` > `aac`. + - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). 
If `--prefer-free-formats` is used, the order changes to `ogg` > `opus` > `webm` > `mp3` > `m4a` > `aac` - `ext`: Equivalent to `vext,aext` - `filesize`: Exact filesize, if known in advance - `fs_approx`: Approximate filesize calculated from the manifests diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b792219553..b9d0305b4d 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1693,7 +1693,7 @@ class FormatSort: 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, 'aext': {'type': 'ordered', 'field': 'audio_ext', 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')}, + 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')}, 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', 'field': ('vcodec', 'acodec'), From b505e8517ad2ca8e07d5f9577dfd9a96165beaa0 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 1 Sep 2022 13:38:25 +0530 Subject: [PATCH 117/284] [extractor/youtube] Fallback regex for nsig code extraction --- yt_dlp/extractor/youtube.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b1eda0d07f..9303557f76 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2661,7 +2661,10 @@ def _decrypt_nsig(self, s, video_id, player_url): raise ExtractorError('Cannot decrypt nsig without player_url') player_url = urljoin('https://www.youtube.com', player_url) - jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) + try: + jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) + except ExtractorError as e: + raise ExtractorError('Unable to extract nsig function code', cause=e) if self.get_param('youtube_print_sig_code'): self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') @@ -2706,7 +2709,20 @@ def _extract_n_function_code(self, video_id, player_url): if func_code: return jsi, player_id, func_code - func_code = jsi.extract_function_code(self._extract_n_function_name(jscode)) + func_name = self._extract_n_function_name(jscode) + + # For redundancy + func_code = self._search_regex( + r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s* + # NB: The end of the regex is intentionally kept strict + {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name, + jscode, 'nsig function', group=('var', 'code'), default=None) + if func_code: + func_code = ([func_code[0]], func_code[1]) + else: + self.write_debug('Extracting nsig function with jsinterp') + func_code = jsi.extract_function_code(func_name) + self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code From 05deb747bb18febb803b47119ca7bc432ffb80c8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 1 Sep 2022 13:14:04 +0530 Subject: [PATCH 118/284] [jsinterp] Fix escape in regex --- test/test_jsinterp.py | 5 +++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 2 +- yt_dlp/jsinterp.py | 11 ++++++----- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 4b6e22bac2..0cdf726fbe 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -352,6 +352,11 @@ def test_regex(self): ''') self.assertEqual(jsi.call_function('x').flags & 
re.I, re.I) + jsi = JSInterpreter(''' + function x() { let a=/,][}",],()}(\[)/; return a; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r',][}",],()}(\[)') + def test_char_code_at(self): jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('x', 0), 116) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 717c949540..b1c5cb2b35 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -122,6 +122,10 @@ 'https://www.youtube.com/s/player/113ca41c/player_ias.vflset/en_US/base.js', 'cgYl-tlYkhjT7A', 'hI7BBr2zUgcmMg', ), + ( + 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', + 'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 9303557f76..2748b5dc52 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2702,7 +2702,7 @@ def _extract_n_function_name(self, jscode): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.08.19.2') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1') jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 51c7beed43..27d7f0dfa6 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -245,11 +245,12 @@ def _separate(expr, delim=',', max_split=None): counters[_MATCHING_PARENS[char]] += 1 elif not in_quote and char in counters: counters[char] -= 1 - elif not escaping and char in _QUOTES and in_quote in (char, None): - if in_quote or after_op or char != '/': - in_quote = None if in_quote and not in_regex_char_group else char - elif in_quote == '/' and char in '[]': - in_regex_char_group = char == '[' + elif not escaping: + if char in _QUOTES and in_quote in (char, None): + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + elif in_quote == '/' and char in '[]': + in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op) From 1ac7f461845b3f9c0c3a2e6a1308bf82d3e8e55a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 1 Sep 2022 16:23:18 +0530 Subject: [PATCH 119/284] Update to ytdl-commit-ed5c44e7 [compat] Replace deficient ChainMap class in Py3.3 and earlier https://github.com/ytdl-org/youtube-dl/commit/ed5c44e7b74ac77f87ca5ed6cb5e964a0c6a0678 --- README.md | 2 +- test/test_jsinterp.py | 35 +++++++++++++++++ yt_dlp/YoutubeDL.py | 8 ++-- yt_dlp/jsinterp.py | 91 ++++++++++++++++++++++++------------------- 4 files changed, 92 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 176832ca99..c4667bb572 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/b0a60ce](https://github.com/ytdl-org/youtube-dl/commit/b0a60ce2032172aeaaf27fe3866ab72768f10cb2)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest 
[youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/ed5c44e](https://github.com/ytdl-org/youtube-dl/commit/ed5c44e7b74ac77f87ca5ed6cb5e964a0c6a0678)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 0cdf726fbe..b46d0949d4 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -71,6 +71,9 @@ def test_operators(self): jsi = JSInterpreter('function f(){return 0 ?? 42;}') self.assertEqual(jsi.call_function('f'), 0) + jsi = JSInterpreter('function f(){return "life, the universe and everything" < 42;}') + self.assertFalse(jsi.call_function('f')) + def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) @@ -193,6 +196,30 @@ def test_try(self): ''') self.assertEqual(jsi.call_function('x'), 10) + def test_catch(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + + def test_finally(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + + def test_nested_try(self): + jsi = JSInterpreter(''' + function x() {try { + try{throw 10} finally {throw 42} + } catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + def test_for_loop_continue(self): jsi = JSInterpreter(''' function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } @@ -205,6 +232,14 @@ def test_for_loop_break(self): ''') self.assertEqual(jsi.call_function('x'), 0) + def test_for_loop_try(self): + jsi = JSInterpreter(''' + function x() { + for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; + return 42 } + ''') + self.assertEqual(jsi.call_function('x'), 42) + def test_literal_list(self): jsi = JSInterpreter(''' function x() { return [1, 2, "asdf", [5, 6, 7]][3] } diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 10c17ea007..2b5b3fdfc5 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2528,9 +2528,6 @@ def sanitize_numeric_fields(info): '--live-from-start is passed, but there are no formats that can be downloaded from the start. 
' 'If you want to download from the current time, use --no-live-from-start')) - if not formats: - self.raise_no_formats(info_dict) - def is_wellformed(f): url = f.get('url') if not url: @@ -2543,7 +2540,10 @@ def is_wellformed(f): return True # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + self.raise_no_formats(info_dict) formats_dict = {} diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 27d7f0dfa6..2bb4acf3e7 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -72,6 +72,8 @@ def _js_comp_op(op): def wrapped(a, b): if JS_Undefined in (a, b): return False + if isinstance(a, str) or isinstance(b, str): + return op(str(a or 0), str(b or 0)) return op(a or 0, b or 0) return wrapped @@ -268,7 +270,9 @@ def _separate(expr, delim=',', max_split=None): yield expr[start:] @classmethod - def _separate_at_paren(cls, expr, delim): + def _separate_at_paren(cls, expr, delim=None): + if delim is None: + delim = expr and _MATCHING_PARENS[expr[0]] separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: raise cls.Exception(f'No terminating paren {delim}', expr) @@ -347,7 +351,7 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): if expr.startswith('new '): obj = expr[4:] if obj.startswith('Date('): - left, right = self._separate_at_paren(obj[4:], ')') + left, right = self._separate_at_paren(obj[4:]) expr = unified_timestamp( self.interpret_expression(left, local_vars, allow_recursion), False) if not expr: @@ -361,8 +365,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): return None, should_return if expr.startswith('{'): - inner, outer = self._separate_at_paren(expr, '}') - # Look for Map first + inner, outer = self._separate_at_paren(expr) + # try for object expression (Map) sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] if all(len(sub_expr) == 2 for sub_expr in sub_expressions): def dict_item(key, val): @@ -380,7 +384,7 @@ def dict_item(key, val): expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - inner, outer = self._separate_at_paren(expr, ')') + inner, outer = self._separate_at_paren(expr) inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return @@ -388,53 +392,62 @@ def dict_item(key, val): expr = self._dump(inner, local_vars) + outer if expr.startswith('['): - inner, outer = self._separate_at_paren(expr, ']') + inner, outer = self._separate_at_paren(expr) name = self._named_object(local_vars, [ self.interpret_expression(item, local_vars, allow_recursion) for item in self._separate(inner)]) expr = name + outer - m = re.match(rf'''(?x) - (?P<try>try|finally)\s*| - (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| - (?P<switch>switch)\s*\(| - (?P<for>for)\s*\(|''', expr) - if m and m.group('try'): - if expr[m.end()] == '{': - try_expr, expr = self._separate_at_paren(expr[m.end():], '}') - else: - try_expr, expr = expr[m.end() - 1:], '' + m = re.match(r'''(?x) + (?P<try>try)\s*\{| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\( + ''', expr) + md = m.groupdict() if m else {} + if md.get('try'): + try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + err = None try: ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) if should_abort: return ret, True - except JS_Throw as e: - 
local_vars[self._EXC_NAME] = e.error except Exception as e: # XXX: This works for now, but makes debugging future issues very hard - local_vars[self._EXC_NAME] = e - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) - return ret, should_abort or should_return + err = e - elif m and m.group('catch'): - catch_expr, expr = self._separate_at_paren(expr[m.end():], '}') - if self._EXC_NAME in local_vars: - catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)}) - ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion) + pending = (None, False) + m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if err: + catch_vars = {} + if m.group('err'): + catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err + catch_vars = local_vars.new_child(catch_vars) + err, pending = None, self.interpret_statement(sub_expr, catch_vars, allow_recursion) + + m = re.match(r'finally\s*\{', expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) if should_abort: return ret, True - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) - return ret, should_abort or should_return + ret, should_abort = pending + if should_abort: + return ret, True - elif m and m.group('for'): - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + if err: + raise err + + elif md.get('for'): + constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): - body, expr = self._separate_at_paren(remaining, '}') + body, expr = self._separate_at_paren(remaining) else: switch_m = re.match(r'switch\s*\(', remaining) # FIXME if switch_m: - switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')') + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') body = 'switch(%s){%s}' % (switch_val, body) else: @@ -453,11 +466,9 @@ def dict_item(key, val): except JS_Continue: pass self.interpret_expression(increment, local_vars, allow_recursion) - ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) - return ret, should_abort or should_return - elif m and m.group('switch'): - switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + elif md.get('switch'): + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') items = body.replace('default:', 'case default:').split('case ')[1:] @@ -480,6 +491,8 @@ def dict_item(key, val): break if matched: break + + if md: ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) return ret, should_abort or should_return @@ -584,7 +597,7 @@ def dict_item(key, val): member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] if arg_str.startswith('('): - arg_str, remaining = self._separate_at_paren(arg_str, ')') + arg_str, remaining = self._separate_at_paren(arg_str) else: arg_str, remaining = None, arg_str @@ -769,7 +782,7 @@ def extract_function_code(self, funcname): \((?P<args>[^)]*)\)\s* (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = 
self._separate_at_paren(func_m.group('code'), '}') + code, _ = self._separate_at_paren(func_m.group('code')) if func_m is None: raise self.Exception(f'Could not find JS function "{funcname}"') return [x.strip() for x in func_m.group('args').split(',')], code @@ -784,7 +797,7 @@ def extract_function_from_code(self, argnames, code, *global_stack): if mobj is None: break start, body_start = mobj.span() - body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + body, remaining = self._separate_at_paren(code[body_start - 1:]) name = self._named_object(local_vars, self.extract_function_from_code( [x.strip() for x in mobj.group('args').split(',')], body, local_vars, *global_stack)) From d2c8aadf799a63aaa7da81ae03052b1ec2addd20 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 1 Sep 2022 16:49:03 +0530 Subject: [PATCH 120/284] [cleanup] Misc Closes #4710, Closes #4754, Closes #4723 Authored by: pukkandan, MrRawes, DavidH-2022 --- README.md | 45 +++++++++++++-------------------- devscripts/run_tests.sh | 8 +++--- test/test_YoutubeDL.py | 2 +- test/test_jsinterp.py | 2 +- yt_dlp/YoutubeDL.py | 10 ++++---- yt_dlp/__init__.py | 2 +- yt_dlp/cookies.py | 10 ++++++-- yt_dlp/extractor/_extractors.py | 45 +++++++++++++++++---------------- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/newspicks.py | 2 +- yt_dlp/extractor/triller.py | 2 +- yt_dlp/options.py | 8 +++--- 12 files changed, 67 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index c4667bb572..28fad2815c 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,7 @@ ### Standalone PyInstaller Builds On some systems, you may need to use `py` or `python` instead of `python3`. -Note that pyinstaller [does not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment. +Note that pyinstaller with versions below 4.4 [do not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment. **Important**: Running `pyinstaller` directly **without** using `pyinst.py` is **not** officially supported. This may or may not work correctly. 
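In practice, the virtual-environment route recommended above amounts to roughly the following before running `pyinst.py` (a minimal sketch, assuming a POSIX shell at the repository root and the repository's own `requirements.txt`; the venv directory name is arbitrary):

```bash
# Standalone build inside a venv, per the caveats quoted above
python3 -m venv .venv
. .venv/bin/activate
python3 -m pip install -U pyinstaller -r requirements.txt
python3 devscripts/make_lazy_extractors.py  # optional: prebuild lazy extractors for faster startup
python3 pyinst.py  # use the wrapper rather than invoking pyinstaller directly
```

On Windows, the activation step becomes `.venv\Scripts\activate`, and `py` may stand in for `python3` as noted above.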
@@ -531,8 +531,8 @@ ## Video Selection: a file that is in the archive --break-on-reject Stop the download process when encountering a file that has been filtered out - --break-per-input Make --break-on-existing, --break-on-reject, - --max-downloads and autonumber reset per + --break-per-input --break-on-existing, --break-on-reject, + --max-downloads, and autonumber resets per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue @@ -1238,7 +1238,6 @@ # OUTPUT TEMPLATE - `id` (string): Video identifier - `title` (string): Video title - `fulltitle` (string): Video title ignoring live timestamp and generic title - - `url` (string): Video URL - `ext` (string): Video filename extension - `alt_title` (string): A secondary title of the video - `description` (string): The description of the video @@ -1273,26 +1272,6 @@ # OUTPUT TEMPLATE - `availability` (string): Whether the video is "private", "premium_only", "subscriber_only", "needs_auth", "unlisted" or "public" - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL - `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL - - `format` (string): A human-readable description of the format - - `format_id` (string): Format code specified by `--format` - - `format_note` (string): Additional info about the format - - `width` (numeric): Width of the video - - `height` (numeric): Height of the video - - `resolution` (string): Textual description of width and height - - `tbr` (numeric): Average bitrate of audio and video in KBit/s - - `abr` (numeric): Average audio bitrate in KBit/s - - `acodec` (string): Name of the audio codec in use - - `asr` (numeric): Audio sampling rate in Hertz - - `vbr` (numeric): Average video bitrate in KBit/s - - `fps` (numeric): Frame rate - - `dynamic_range` (string): The dynamic range of the video - - `audio_channels` (numeric): The number of audio channels - - `stretched_ratio` (float): `width:height` of the video's pixels, if not square - - `vcodec` (string): Name of the video codec in use - - `container` (string): Name of the container format - - `filesize` (numeric): The number of bytes, if known in advance - - `filesize_approx` (numeric): An estimate for the number of bytes - - `protocol` (string): The protocol that will be used for the actual download - `extractor` (string): Name of the extractor - `extractor_key` (string): Key name of the extractor - `epoch` (numeric): Unix epoch of when the information extraction was completed @@ -1311,6 +1290,8 @@ # OUTPUT TEMPLATE - `webpage_url_basename` (string): The basename of the webpage URL - `webpage_url_domain` (string): The domain of the webpage URL - `original_url` (string): The URL given by the user (or same as `webpage_url` for playlist entries) + +All the fields in [Filtering Formats](#filtering-formats) can also be used Available for the video that belongs to some logical chapter or section: @@ -1392,13 +1373,13 @@ #### Output template and Windows batch files #### Output template examples ```bash -$ yt-dlp --get-filename -o "test video.%(ext)s" BaW_jenozKc +$ yt-dlp --print filename -o "test video.%(ext)s" BaW_jenozKc test video.webm # Literal name with correct extension -$ yt-dlp --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc +$ yt-dlp --print filename -o "%(title)s.%(ext)s" BaW_jenozKc youtube-dl test video ''_ä↭𝕐.webm # All kinds of weird characters -$ yt-dlp --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc 
--restrict-filenames +$ yt-dlp --print filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filenames youtube-dl_test_video_.webm # Restricted file name # Download YouTube playlist videos in separate directory indexed by video order in a playlist @@ -1487,6 +1468,7 @@ ## Filtering Formats The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals): - `filesize`: The number of bytes, if known in advance + - `filesize_approx`: An estimate for the number of bytes - `width`: Width of the video, if known - `height`: Height of the video, if known - `tbr`: Average bitrate of audio and video in KBit/s @@ -1494,16 +1476,23 @@ ## Filtering Formats - `vbr`: Average video bitrate in KBit/s - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate + - `audio_channels`: The number of audio channels + - `stretched_ratio`: `width:height` of the video's pixels, if not square Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains), `~=` (matches regex) and following string meta fields: + - `url`: Video URL - `ext`: File extension - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - - `format_id`: A short description of the format - `language`: Language code + - `dynamic_range`: The dynamic range of the video + - `format_id`: A short description of the format + - `format`: A human-readable description of the format + - `format_note`: Additional info about the format + - `resolution`: Textual description of width and height Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). The comparand of a string comparison needs to be quoted with either double or single quotes if it contains spaces or special characters other than `._-`. diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh index d496a092b1..faa642e96c 100755 --- a/devscripts/run_tests.sh +++ b/devscripts/run_tests.sh @@ -1,13 +1,13 @@ #!/usr/bin/env sh -if [ -z $1 ]; then +if [ -z "$1" ]; then test_set='test' -elif [ $1 = 'core' ]; then +elif [ "$1" = 'core' ]; then test_set="-m not download" -elif [ $1 = 'download' ]; then +elif [ "$1" = 'download' ]; then test_set="-m download" else - echo 'Invalid test type "'$1'". Use "core" | "download"' + echo 'Invalid test type "'"$1"'". 
Use "core" | "download"' exit 1 fi diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 49dc2c198c..426e52305d 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -668,7 +668,7 @@ def test_add_extra_info(self): def test_prepare_outtmpl_and_filename(self): def test(tmpl, expected, *, info=None, **params): params['outtmpl'] = tmpl - ydl = YoutubeDL(params) + ydl = FakeYDL(params) ydl._num_downloads = 1 self.assertEqual(ydl.validate_outtmpl(tmpl), None) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b46d0949d4..92ef532f56 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -387,7 +387,7 @@ def test_regex(self): ''') self.assertEqual(jsi.call_function('x').flags & re.I, re.I) - jsi = JSInterpreter(''' + jsi = JSInterpreter(R''' function x() { let a=/,][}",],()}(\[)/; return a; } ''') self.assertEqual(jsi.call_function('x').pattern, r',][}",],()}(\[)') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 2b5b3fdfc5..a6bbbb1280 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1044,7 +1044,7 @@ def _parse_outtmpl(self): def get_output_path(self, dir_type='', filename=None): paths = self.params.get('paths', {}) - assert isinstance(paths, dict) + assert isinstance(paths, dict), '"paths" parameter must be a dictionary' path = os.path.join( expand_path(paths.get('home', '').strip()), expand_path(paths.get(dir_type, '').strip()) if dir_type else '', @@ -2745,9 +2745,9 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): if lang not in available_subs: available_subs[lang] = cap_info - if (not self.params.get('writesubtitles') and not - self.params.get('writeautomaticsub') or not - available_subs): + if not available_subs or ( + not self.params.get('writesubtitles') + and not self.params.get('writeautomaticsub')): return None all_sub_langs = tuple(available_subs.keys()) @@ -2764,7 +2764,7 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): else: requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1] if requested_langs: - self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs)) + self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}') formats_query = self.params.get('subtitlesformat', 'best') formats_preference = formats_query.split('/') if formats_query else [] diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 552f29bd96..356155fcdd 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -365,7 +365,7 @@ def parse_chapters(name, value): if keyring not in SUPPORTED_KEYRINGS: raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". 
' f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') - opts.cookiesfrombrowser = (browser_name, profile or None, keyring, container or None) + opts.cookiesfrombrowser = (browser_name, profile, keyring, container) # MetadataParser def metadataparser_actions(f): diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 9100f46ac2..0ccd22947e 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -25,7 +25,13 @@ sqlite3, ) from .minicurses import MultilinePrinter, QuietMultilinePrinter -from .utils import Popen, YoutubeDLCookieJar, error_to_str, expand_path, try_call +from .utils import ( + Popen, + YoutubeDLCookieJar, + error_to_str, + expand_path, + try_call, +) CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -138,7 +144,7 @@ def _extract_firefox_cookies(profile, container, logger): containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): raise FileNotFoundError(f'could not read containers.json in {search_root}') - with open(containers_path, 'r') as containers: + with open(containers_path) as containers: identities = json.load(containers).get('identities', []) container_id = next((context.get('userContextId') for context in identities if container in ( context.get('name'), diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8368e9315b..82b701a5dd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1,5 +1,28 @@ # flake8: noqa: F401 +from .youtube import ( # Youtube is moved to the top to improve performance + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeStoriesIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, +) + from .abc import ( ABCIE, ABCIViewIE, @@ -2191,28 +2214,6 @@ from .youporn import YouPornIE from .yourporn import YourPornIE from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeClipIE, - YoutubeFavouritesIE, - YoutubeNotificationsIE, - YoutubeHistoryIE, - YoutubeTabIE, - YoutubeLivestreamEmbedIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeMusicSearchURLIE, - YoutubeSubscriptionsIE, - YoutubeStoriesIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeYtBeIE, - YoutubeYtUserIE, - YoutubeWatchLaterIE, -) from .zapiks import ZapiksIE from .zattoo import ( BBVTVIE, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b9d0305b4d..c76133d8f5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3874,7 +3874,7 @@ def extract_from_webpage(cls, ydl, url, webpage): def _extract_from_webpage(cls, url, webpage): for embed_url in orderedSet( cls._extract_embed_urls(url, webpage) or [], lazy=True): - yield cls.url_result(embed_url, cls) + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) @classmethod def _extract_embed_urls(cls, url, webpage): diff --git a/yt_dlp/extractor/newspicks.py b/yt_dlp/extractor/newspicks.py index 0232d53570..a368ce4e02 100644 --- a/yt_dlp/extractor/newspicks.py 
+++ b/yt_dlp/extractor/newspicks.py
@@ -5,7 +5,7 @@
 
 
 class NewsPicksIE(InfoExtractor):
-    _VALID_URL = r'https://newspicks.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)'
+    _VALID_URL = r'https://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'https://newspicks.com/movie-series/11?movieId=1813',
diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py
index c199da91da..e4123f8091 100644
--- a/yt_dlp/extractor/triller.py
+++ b/yt_dlp/extractor/triller.py
@@ -3,13 +3,13 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     int_or_none,
     str_or_none,
     traverse_obj,
     unified_strdate,
     unified_timestamp,
     url_basename,
-    ExtractorError,
 )
 
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 0fbf1f028b..4aa0acfbc5 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -442,9 +442,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         'allowed_values': {
             'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
             'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
-            'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
-            'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
-            'no-youtube-prefer-utc-upload-date'
+            'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley',
+            'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
+            'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date',
         }, 'aliases': {
             'youtube-dl': ['all', '-multistreams'],
             'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'],
@@ -634,7 +634,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
     selection.add_option(
         '--break-per-input',
         action='store_true', dest='break_per_url', default=False,
-        help='Make --break-on-existing, --break-on-reject, --max-downloads and autonumber reset per input URL')
+        help='--break-on-existing, --break-on-reject, --max-downloads, and autonumber resets per input URL')
     selection.add_option(
         '--no-break-per-input',
         action='store_false', dest='break_per_url',

From 5d7c7d65698c7bfb281926181e7824989f1a236f Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Thu, 1 Sep 2022 16:24:21 +0530
Subject: [PATCH 121/284] Release 2022.09.01

---
 CONTRIBUTORS      |  9 +++++++++
 Changelog.md      | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 supportedsites.md |  9 +++++++++
 3 files changed, 66 insertions(+)

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index eaf3450405..8bede1efd4 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -299,3 +299,12 @@ bashonly
 jacobtruman
 masta79
 palewire
+cgrigis
+DavidH-2022
+dfaker
+jackyyf
+ohaiibuzzle
+SamantazFox
+shreyasminocha
+tejasa97
+xenova
diff --git a/Changelog.md b/Changelog.md
index 5d72db7d0b..561b88ce63 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -11,6 +11,54 @@
 # Instuctions for creating release
 
 -->
 
+### 2022.09.01
+
+* Add option `--use-extractors`
+* Merge youtube-dl: Upto [commit/ed5c44e](https://github.com/ytdl-org/youtube-dl/commit/ed5c44e7)
+* Add yt-dlp version to infojson
+* Fix `--break-per-url --max-downloads`
+* Fix bug in `--alias`
+* [cookies] Support firefox container in `--cookies-from-browser` by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+*
[downloader/external] Smarter detection of executable +* [extractor/generic] Don't return JW player without formats +* [FormatSort] Fix `aext` for `--prefer-free-formats` +* [jsinterp] Various improvements by [pukkandan](https://github.com/pukkandan), [dirkf](https://github.com/dirkf), [elyse0](https://github.com/elyse0) +* [cache] Mechanism to invalidate old cache +* [utils] Add `deprecation_warning` +* [utils] Add `orderedSet_from_options` +* [utils] `Popen`: Restore `LD_LIBRARY_PATH` when using PyInstaller by [Lesmiscore](https://github.com/Lesmiscore) +* [build] `make tar` should not follow `DESTDIR` by [satan1st](https://github.com/satan1st) +* [build] Update pyinstaller by [shirt-dev](https://github.com/shirt-dev) +* [test] Fix `test_youtube_signature` +* [cleanup] Misc fixes and cleanup by [DavidH-2022](https://github.com/DavidH-2022), [MrRawes](https://github.com/MrRawes), [pukkandan](https://github.com/pukkandan) +* [extractor/epoch] Add extractor by [tejasa97](https://github.com/tejasa97) +* [extractor/eurosport] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/IslamChannel] Add extractors by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/newspicks] Add extractor by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/triller] Add extractor by [bashonly](https://github.com/bashonly) +* [extractor/VQQ] Add extractors by [elyse0](https://github.com/elyse0) +* [extractor/youtube] Improvements to nsig extraction +* [extractor/youtube] Fix bug in format sorting +* [extractor/youtube] Update iOS Innertube clients by [SamantazFox](https://github.com/SamantazFox) +* [extractor/youtube] Use device-specific user agent by [coletdjnz](https://github.com/coletdjnz) +* [extractor/youtube] Add `--compat-option no-youtube-prefer-utc-upload-date` by [coletdjnz](https://github.com/coletdjnz) +* [extractor/arte] Bug fix by [cgrigis](https://github.com/cgrigis) +* [extractor/bilibili] Extract `flac` with premium account by [jackyyf](https://github.com/jackyyf) +* [extractor/BiliBiliSearch] Don't sort by date +* [extractor/BiliBiliSearch] Fix infinite loop +* [extractor/bitchute] Mark errors as expected +* [extractor/crunchyroll:beta] Use anonymous access by [tejing1](https://github.com/tejing1) +* [extractor/huya] Fix stream extraction by [ohaiibuzzle](https://github.com/ohaiibuzzle) +* [extractor/medaltv] Fix extraction by [xenova](https://github.com/xenova) +* [extractor/mediaset] Fix embed extraction +* [extractor/mixcloud] All formats are audio-only +* [extractor/rtbf] Fix jwt extraction by [elyse0](https://github.com/elyse0) +* [extractor/screencastomatic] Support `--video-password` by [shreyasminocha](https://github.com/shreyasminocha) +* [extractor/stripchat] Don't modify input URL by [dfaker](https://github.com/dfaker) +* [extractor/uktv] Improve `_VALID_URL` by [dirkf](https://github.com/dirkf) +* [extractor/vimeo:user] Fix `_VALID_URL` + + ### 2022.08.19 * Fix bug in `--download-archive` diff --git a/supportedsites.md b/supportedsites.md index c115c00e36..d98863315d 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -364,6 +364,7 @@ # Supported sites - **Engadget** - **Epicon** - **EpiconSeries** + - **Epoch** - **Eporner** - **EroProfile**: [<abbr title="netrc machine"><em>eroprofile</em></abbr>] - **EroProfile:album** @@ -377,6 +378,7 @@ # Supported sites - **EsriVideo** - **Europa** - **EuropeanTour** + - **Eurosport** - **EUScreen** - **EWETV**: [<abbr title="netrc machine"><em>ewetv</em></abbr>] - **EWETVLive**: [<abbr title="netrc 
machine"><em>ewetv</em></abbr>] @@ -553,6 +555,8 @@ # Supported sites - **iq.com**: International version of iQiyi - **iq.com:album** - **iqiyi**: [<abbr title="netrc machine"><em>iqiyi</em></abbr>] 爱奇艺 + - **IslamChannel** + - **IslamChannelSeries** - **ITProTV** - **ITProTVCourse** - **ITTF** @@ -820,6 +824,7 @@ # Supported sites - **Newgrounds** - **Newgrounds:playlist** - **Newgrounds:user** + - **NewsPicks** - **Newstube** - **Newsy** - **NextMedia**: 蘋果日報 @@ -1331,6 +1336,8 @@ # Supported sites - **ToypicsUser**: Toypics user profile - **TrailerAddict**: (**Currently broken**) - **TravelChannel** + - **Triller**: [<abbr title="netrc machine"><em>triller</em></abbr>] + - **TrillerUser**: [<abbr title="netrc machine"><em>triller</em></abbr>] - **Trilulilu** - **Trovo** - **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix @@ -1506,6 +1513,8 @@ # Supported sites - **VoxMedia** - **VoxMediaVolume** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **vqq:series** + - **vqq:video** - **Vrak** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **VrtNU**: [<abbr title="netrc machine"><em>vrtnu</em></abbr>] VrtNU.be From adba24d2079d350fc03226adff3cae919d7a11db Mon Sep 17 00:00:00 2001 From: github-actions <github-actions@example.com> Date: Thu, 1 Sep 2022 11:26:07 +0000 Subject: [PATCH 122/284] [version] update Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++++---- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++++---- yt_dlp/version.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 6f03f6e585..b77a5c8070 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,7 +62,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -70,8 +70,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to 
date (2022.09.01) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 7904889a5b..39d5ec8cce 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -74,7 +74,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -82,8 +82,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to date (2022.09.01) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 7d1f337322..a3a786e387 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -70,7 +70,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -78,8 +78,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest 
version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to date (2022.09.01) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index da68f4517a..79b3849492 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -55,7 +55,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -63,8 +63,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to date (2022.09.01) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 4fbda845fb..0eaee4441b 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true @@ -51,7 +51,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -59,7 +59,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to date (2022.09.01) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index c51ed1b9cc..acfbeb74b9 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.08.19** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.08.19 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,7 +65,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.08.19, Current version: 2022.08.19 - yt-dlp is up to date (2022.08.19) + Latest version: 2022.09.01, Current version: 2022.09.01 + yt-dlp is up to date (2022.09.01) <more lines> render: shell diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 8bfe0a09b4..ac7a825eae 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.08.19.2' +__version__ = '2022.09.01' -RELEASE_GIT_HEAD = '48c88e088' +RELEASE_GIT_HEAD = '5d7c7d656' VARIANT = None From 7c6eb424d35e51c81f8fe9e1eb7cc18067c3a8a7 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 2 Sep 2022 01:28:56 +0530 Subject: [PATCH 123/284] [extractor/youtube] Detect `lazy-load-for-videos` embeds Closes #4812 --- yt_dlp/extractor/youtube.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2748b5dc52..4a5d6805e9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -923,19 +923,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:\#|$)""" % { 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } - _EMBED_REGEX = [r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1'''] + _EMBED_REGEX = [ + r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) 
+ \1''', + # https://wordpress.org/plugins/lazy-load-for-videos/ + r'''(?xs) + <a\s[^>]*\bhref="(?P<url>https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})" + \s[^>]*\bclass="[^"]*\blazy-load-youtube''', + ] + _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', From 2c475e48b54b071a3e59441829b6dec7d5b3c0ac Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 2 Sep 2022 01:38:21 +0530 Subject: [PATCH 124/284] [extractor/bandcamp] Extract `uploader_url` Closes #4755 --- yt_dlp/extractor/bandcamp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index b34fcb1081..2dae49e770 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -21,7 +21,7 @@ class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"'] _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', @@ -85,7 +85,7 @@ def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): attr + ' data', group=2), video_id, fatal=fatal) def _real_extract(self, url): - title = self._match_id(url) + title, uploader = self._match_valid_url(url).group('id', 'uploader') webpage = self._download_webpage(url, title) tralbum = self._extract_data_attr(webpage, title) thumbnail = self._og_search_thumbnail(webpage) @@ -197,6 +197,8 @@ def _real_extract(self, url): 'title': title, 'thumbnail': thumbnail, 'uploader': artist, + 'uploader_id': uploader, + 'uploader_url': f'https://{uploader}.bandcamp.com', 'timestamp': timestamp, 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, From 5469a4ab117448c77ebd660cedd012ec2975d289 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 2 Sep 2022 01:51:04 +0530 Subject: [PATCH 125/284] [extractor/motorsport] Support native embeds Closes #4749 --- yt_dlp/extractor/motorsport.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/motorsport.py b/yt_dlp/extractor/motorsport.py index b292aeb9a1..efb087d035 100644 --- a/yt_dlp/extractor/motorsport.py +++ b/yt_dlp/extractor/motorsport.py @@ -31,8 +31,13 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) iframe_path = self._html_search_regex( - r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, - 'iframe path') + r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, 'iframe path', default=None) + + if iframe_path is None: + iframe_path = self._html_search_regex( + r'<iframe [^>]*\bsrc="(https://motorsport\.tv/embed/[^"]+)', webpage, 'embed iframe path') + return self.url_result(iframe_path) + iframe = self._download_webpage( compat_urlparse.urljoin(url, iframe_path), display_id, 'Downloading iframe') From d6f8871964253373ddaae60c89f1f4838769e7df Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Sep 2022 22:08:05 +0000 Subject: [PATCH 126/284] [extractor/triller] Fix auth token (#4813) Authored by: bashonly --- yt_dlp/extractor/triller.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) 
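The fix below folds the old `_AUTH_TOKEN` attribute into the shared class-level `_API_HEADERS` mapping, so the `Origin` header and the bearer token travel together on every API request instead of being re-assembled per call. A minimal standalone sketch of the pattern, where the client class and endpoint are placeholders rather than the real Triller API:

```
# Sketch of the shared-header pattern, assuming a generic JSON API.
# ApiClient and api.example.com are illustrative names, not Triller's.
import urllib.request


class ApiClient:
    _API_BASE_URL = 'https://api.example.com/v1'      # placeholder endpoint
    _API_HEADERS = {'Origin': 'https://example.com'}  # shared by every request

    def login(self, token):
        # The presence of an Authorization entry doubles as the
        # "already logged in" check, replacing a separate _AUTH_TOKEN flag
        if self._API_HEADERS.get('Authorization'):
            return
        self._API_HEADERS['Authorization'] = f'Bearer {token}'

    def fetch(self, path):
        # Guest and authenticated calls now share one code path
        request = urllib.request.Request(
            f'{self._API_BASE_URL}/{path}', headers=self._API_HEADERS)
        with urllib.request.urlopen(request) as response:
            return response.read()
```

Because `_API_HEADERS` is a class attribute mutated in place, a token stored by one instance is visible to every other, which is how the patch lets `TrillerIE` and `TrillerUserIE` reuse a single guest session.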
diff --git a/yt_dlp/extractor/triller.py b/yt_dlp/extractor/triller.py index e4123f8091..2d633ca67d 100644 --- a/yt_dlp/extractor/triller.py +++ b/yt_dlp/extractor/triller.py @@ -15,11 +15,11 @@ class TrillerBaseIE(InfoExtractor): _NETRC_MACHINE = 'triller' - _AUTH_TOKEN = None _API_BASE_URL = 'https://social.triller.co/v1.5' + _API_HEADERS = {'Origin': 'https://triller.co'} def _perform_login(self, username, password): - if self._AUTH_TOKEN: + if self._API_HEADERS.get('Authorization'): return user_check = self._download_json( @@ -46,13 +46,13 @@ def _perform_login(self, username, password): raise ExtractorError('Unable to login: Incorrect password', expected=True) raise ExtractorError('Unable to login') - self._AUTH_TOKEN = login['auth_token'] + self._API_HEADERS['Authorization'] = f'Bearer {login["auth_token"]}' def _get_comments(self, video_id, limit=15): comment_info = self._download_json( f'{self._API_BASE_URL}/api/videos/{video_id}/comments_v2', video_id, fatal=False, note='Downloading comments API JSON', - headers={'Origin': 'https://triller.co'}, query={'limit': limit}) or {} + headers=self._API_HEADERS, query={'limit': limit}) or {} if not comment_info.get('comments'): return for comment_dict in comment_info['comments']: @@ -210,9 +210,7 @@ def _real_extract(self, url): f'{self._API_BASE_URL}/api/videos/{video_uuid}', video_uuid, note='Downloading video info API JSON', errnote='Unable to download video info API JSON', - headers={ - 'Origin': 'https://triller.co', - }), ('videos', 0)) + headers=self._API_HEADERS), ('videos', 0)) if not video_info: raise ExtractorError('No video info found in API response') @@ -242,19 +240,17 @@ class TrillerUserIE(TrillerBaseIE): }] def _real_initialize(self): - if not self._AUTH_TOKEN: + if not self._API_HEADERS.get('Authorization'): guest = self._download_json( f'{self._API_BASE_URL}/user/create_guest', - None, note='Creating guest session', data=b'', headers={ - 'Origin': 'https://triller.co', - }, query={ + None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ 'platform': 'Web', 'app_version': '', }) if not guest.get('auth_token'): raise ExtractorError('Unable to fetch required auth token for user extraction') - self._AUTH_TOKEN = guest['auth_token'] + self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}' def _extract_video_list(self, username, user_id, limit=6): query = { @@ -266,10 +262,8 @@ def _extract_video_list(self, username, user_id, limit=6): video_list = self._download_json( f'{self._API_BASE_URL}/api/users/{user_id}/videos', username, note=f'Downloading user video list page {page}', - errnote='Unable to download user video list', headers={ - 'Authorization': f'Bearer {self._AUTH_TOKEN}', - 'Origin': 'https://triller.co', - }, query=query) + errnote='Unable to download user video list', headers=self._API_HEADERS, + query=query) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: retry.error = e @@ -291,10 +285,7 @@ def _real_extract(self, url): user_info = self._check_user_info(self._download_json( f'{self._API_BASE_URL}/api/users/by_username/{username}', username, note='Downloading user info', - errnote='Failed to download user info', headers={ - 'Authorization': f'Bearer {self._AUTH_TOKEN}', - 'Origin': 'https://triller.co', - }).get('user', {})) + errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {})) user_id = str_or_none(user_info.get('user_id')) videos = self._extract_video_list(username, user_id) From 
3c7a2762343280d0e749acffd0edcf72fa4d0661 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Fri, 2 Sep 2022 15:51:12 +0900 Subject: [PATCH 127/284] [extractor/amazonstore] Retry to avoid captcha page (#4811) Authored by: Lesmiscore --- yt_dlp/extractor/amazon.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py index de4917adcc..56a8d844ac 100644 --- a/yt_dlp/extractor/amazon.py +++ b/yt_dlp/extractor/amazon.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ExtractorError, int_or_none class AmazonStoreIE(InfoExtractor): @@ -38,8 +38,14 @@ class AmazonStoreIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + + for retry in self.RetryManager(fatal=True): + webpage = self._download_webpage(url, id) + try: + data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + except ExtractorError as e: + retry.error = e + entries = [{ 'id': video['marketPlaceID'], 'url': video['url'], From 1a7c9fad9f89b8994911c7d83f012da5f1aef445 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 2 Sep 2022 20:41:39 +0530 Subject: [PATCH 128/284] [jsinterp] Workaround operator associativity issue https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1235384480 --- test/test_youtube_signature.py | 4 ++++ yt_dlp/jsinterp.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index b1c5cb2b35..c3dcb4d68f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -126,6 +126,10 @@ 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', 'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA', ), + ( + 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', + 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 2bb4acf3e7..4caad6f743 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -117,8 +117,8 @@ def _js_ternary(cndn, if_true=True, if_false=False): '-': _js_arith_op(operator.sub), '*': _js_arith_op(operator.mul), - '/': _js_div, '%': _js_mod, + '/': _js_div, '**': _js_exp, } From a12d03e15dc0d7ea1192dda77c389132a6a4e5d8 Mon Sep 17 00:00:00 2001 From: TokyoBlackHole <93612363+TokyoBlackHole@users.noreply.github.com> Date: Sat, 3 Sep 2022 00:11:25 +0200 Subject: [PATCH 129/284] [extractor/animeondemand] Remove extractor (#4830) Authored by: TokyoBlackHole --- supportedsites.md | 1 - yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/animeondemand.py | 282 ------------------------------ 3 files changed, 284 deletions(-) delete mode 100644 yt_dlp/extractor/animeondemand.py diff --git a/supportedsites.md b/supportedsites.md index d98863315d..7b1e72016b 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -66,7 +66,6 @@ # Supported sites - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Angel** - **AnimalPlanet** - - **AnimeOnDemand**: [<abbr title="netrc machine"><em>animeondemand</em></abbr>] - **ant1newsgr:article**: ant1news.gr articles - **ant1newsgr:embed**: ant1news.gr embedded videos - **ant1newsgr:watch**: ant1news.gr videos diff --git 
a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 82b701a5dd..e031cecaa3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -84,7 +84,6 @@ AmericasTestKitchenSeasonIE, ) from .angel import AngelIE -from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE diff --git a/yt_dlp/extractor/animeondemand.py b/yt_dlp/extractor/animeondemand.py deleted file mode 100644 index de49db4ea7..0000000000 --- a/yt_dlp/extractor/animeondemand.py +++ /dev/null @@ -1,282 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - join_nonempty, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class AnimeOnDemandIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' - _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' - _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' - _NETRC_MACHINE = 'animeondemand' - # German-speaking countries of Europe - _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] - _TESTS = [{ - # jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/161', - 'info_dict': { - 'id': '161', - 'title': 'Grimgar, Ashes and Illusions (OmU)', - 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', - }, - 'playlist_mincount': 4, - }, { - # Film wording is used instead of Episode, ger/jap, Dub/OmU - 'url': 'https://www.anime-on-demand.de/anime/39', - 'only_matching': True, - }, { - # Episodes without titles, jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/162', - 'only_matching': True, - }, { - # ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/169', - 'only_matching': True, - }, { - # Full length film, non-series, ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/185', - 'only_matching': True, - }, { - # Flash videos - 'url': 'https://www.anime-on-demand.de/anime/12', - 'only_matching': True, - }] - - def _perform_login(self, username, password): - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: - self.raise_geo_restricted( - '%s is only available in German-speaking countries of Europe' % self.IE_NAME) - - login_form = self._form_hidden_inputs('new_user', login_page) - - login_form.update({ - 'user[login]': username, - 'user[password]': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Referer': self._LOGIN_URL, - }) - - if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): - error = self._search_regex( - r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>', - response, 'error', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') - - def _real_extract(self, url): - anime_id = self._match_id(url) - - webpage = self._download_webpage(url, anime_id) - - if 'data-playlist=' not in webpage: - 
self._download_webpage( - self._APPLY_HTML5_URL, anime_id, - 'Activating HTML5 beta', 'Unable to apply HTML5 beta') - webpage = self._download_webpage(url, anime_id) - - csrf_token = self._html_search_meta( - 'csrf-token', webpage, 'csrf token', fatal=True) - - anime_title = self._html_search_regex( - r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', - webpage, 'anime name') - anime_description = self._html_search_regex( - r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', - webpage, 'anime description', default=None) - - def extract_info(html, video_id, num=None): - title, description = [None] * 2 - formats = [] - - for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): - attributes = extract_attributes(input_) - title = attributes.get('data-dialog-header') - playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): - playlist_url = attributes.get(playlist_key) - if isinstance(playlist_url, compat_str) and re.match( - r'/?[\da-zA-Z]+', playlist_url): - playlist_urls.append(attributes[playlist_key]) - if not playlist_urls: - continue - - lang = attributes.get('data-lang') - lang_note = attributes.get('value') - - for playlist_url in playlist_urls: - kind = self._search_regex( - r'videomaterialurl/\d+/([^/]+)/', - playlist_url, 'media kind', default=None) - format_id = join_nonempty(lang, kind) if lang or kind else str(num) - format_note = join_nonempty(kind, lang_note, delim=', ') - item_id_list = [] - if format_id: - item_id_list.append(format_id) - item_id_list.append('videomaterial') - playlist = self._download_json( - urljoin(url, playlist_url), video_id, - 'Downloading %s JSON' % ' '.join(item_id_list), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }, fatal=False) - if not playlist: - continue - stream_url = url_or_none(playlist.get('streamurl')) - if stream_url: - rtmp = re.search( - r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', - stream_url) - if rtmp: - formats.append({ - 'url': rtmp.group('url'), - 'app': rtmp.group('app'), - 'play_path': rtmp.group('playpath'), - 'page_url': url, - 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', - 'rtmp_real_time': True, - 'format_id': 'rtmp', - 'ext': 'flv', - }) - continue - start_video = playlist.get('startvideo', 0) - playlist = playlist.get('playlist') - if not playlist or not isinstance(playlist, list): - continue - playlist = playlist[start_video] - title = playlist.get('title') - if not title: - continue - description = playlist.get('description') - for source in playlist.get('sources', []): - file_ = source.get('file') - if not file_: - continue - ext = determine_ext(file_) - format_id = join_nonempty( - lang, kind, - 'hls' if ext == 'm3u8' else None, - 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) - if ext == 'm3u8': - file_formats = self._extract_m3u8_formats( - file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) - elif source.get('type') == 'video/dash' or ext == 'mpd': - continue - file_formats = self._extract_mpd_formats( - file_, video_id, mpd_id=format_id, fatal=False) - else: - continue - for f in file_formats: - f.update({ - 'language': lang, - 'format_note': format_note, - }) - formats.extend(file_formats) - - return { - 'title': title, - 'description': description, - 'formats': formats, - } - 
- def extract_entries(html, video_id, common_info, num=None): - info = extract_info(html, video_id, num) - - if info['formats']: - self._sort_formats(info['formats']) - f = common_info.copy() - f.update(info) - yield f - - # Extract teaser/trailer only when full episode is not available - if not info['formats']: - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', - html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-%s' % (f['id'], m.group('kind').lower()), - 'title': m.group('title'), - 'url': urljoin(url, m.group('href')), - }) - yield f - - def extract_episodes(html): - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - - for e in extract_entries(episode_html, video_id, common_info): - yield e - - def extract_film(html, video_id): - common_info = { - 'id': anime_id, - 'title': anime_title, - 'description': anime_description, - } - for e in extract_entries(html, video_id, common_info): - yield e - - def entries(): - has_episodes = False - for e in extract_episodes(webpage): - has_episodes = True - yield e - - if not has_episodes: - for e in extract_film(webpage, anime_id): - yield e - - return self.playlist_result( - entries(), anime_id, anime_title, anime_description) From aa824dd10bb645784e2fbf1470e27d3723322fcb Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 3 Sep 2022 03:19:48 +0000 Subject: [PATCH 130/284] [extractor/mediaworksnzvod] Add extractor (#4817) Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mediaworksnz.py | 105 +++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 yt_dlp/extractor/mediaworksnz.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e031cecaa3..aedf063f66 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -937,6 +937,7 @@ MediasiteCatalogIE, MediasiteNamedCatalogIE, ) +from .mediaworksnz import MediaWorksNZVODIE from .medici import MediciIE from .megaphone import MegaphoneIE from .meipai import MeipaiIE diff --git a/yt_dlp/extractor/mediaworksnz.py b/yt_dlp/extractor/mediaworksnz.py new file mode 100644 index 0000000000..651239bd49 --- /dev/null +++ b/yt_dlp/extractor/mediaworksnz.py @@ -0,0 +1,105 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + bug_reports_message, + float_or_none, + traverse_obj, + unified_timestamp, +) + + +class MediaWorksNZVODIE(InfoExtractor): + _VALID_URL_BASE_RE = r'https?://vodupload-api\.mediaworks\.nz/library/asset/published/' + _VALID_URL_ID_RE = r'(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = rf'{_VALID_URL_BASE_RE}{_VALID_URL_ID_RE}' + _TESTS = [{ + 'url': 
'https://vodupload-api.mediaworks.nz/library/asset/published/VID00359', + 'info_dict': { + 'id': 'VID00359', + 'ext': 'mp4', + 'title': 'GRG Jacinda Ardern safe drug testing 1920x1080', + 'description': 'md5:d4d7dc366742e86d8130b257dcb520ba', + 'duration': 142.76, + 'timestamp': 1604268608, + 'upload_date': '20201101', + 'thumbnail': r're:^https?://.*\.jpg$', + 'channel': 'George FM' + } + }, { + # has audio-only format + 'url': 'https://vodupload-api.mediaworks.nz/library/asset/published/VID02627', + 'info_dict': { + 'id': 'VID02627', + 'ext': 'mp3', + 'title': 'Tova O\'Brien meets Ukraine President Volodymyr Zelensky', + 'channel': 'Today FM', + 'description': 'Watch in full the much anticipated interview of Volodymyr Zelensky', + 'duration': 2061.16, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20220822', + 'timestamp': 1661152289, + }, + 'params': {'format': 'ba[ext=mp3]'} + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://www.rova.nz/home/podcasts/socrates-walks-into-a-bar/the-trolley-problem---episode-1.html', + 'info_dict': { + 'id': 'VID02494', + 'ext': 'mp4', + 'title': 'The Trolley Problem', + 'duration': 2843.56, + 'channel': 'Other', + 'timestamp': 1658356489, + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Socrates Walks Into A Bar Podcast Episode 1', + 'upload_date': '20220720', + } + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for mobj in re.finditer( + rf'''(?x)<div\s+\bid=["']Player-Attributes-JWID[^>]+\b + data-request-url=["']{cls._VALID_URL_BASE_RE}["'][^>]+\b + data-asset-id=["']{cls._VALID_URL_ID_RE}["']''', webpage + ): + yield f'https://vodupload-api.mediaworks.nz/library/asset/published/{mobj.group("id")}' + + def _real_extract(self, url): + video_id = self._match_id(url) + asset = self._download_json(url, video_id)['asset'] + + if asset.get('drm') not in ('NonDRM', None): + self.report_drm(video_id) + + content_type = asset.get('type') + if content_type and content_type != 'video': + self.report_warning(f'Unknown content type: {content_type}' + bug_reports_message(), video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['streamingUrl'], video_id) + + audio_streaming_url = traverse_obj( + asset, 'palyoutPathAudio', 'playoutpathaudio', expected_type=str) + if audio_streaming_url: + audio_formats = self._extract_m3u8_formats(audio_streaming_url, video_id, fatal=False, ext='mp3') + for audio_format in audio_formats: + # all the audio streams appear to be aac + audio_format.setdefault('vcodec', 'none') + audio_format.setdefault('acodec', 'aac') + formats.append(audio_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': asset.get('title'), + 'description': asset.get('description'), + 'duration': float_or_none(asset.get('duration')), + 'timestamp': unified_timestamp(asset.get('dateadded')), + 'channel': asset.get('brand'), + 'thumbnails': [{'url': thumbnail_url} for thumbnail_url in asset.get('thumbnails') or []], + 'formats': formats, + 'subtitles': subtitles, + } From 69082b38dcb8ba5c6050d86f592c899a0a71760f Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Sat, 3 Sep 2022 01:44:01 -0500 Subject: [PATCH 131/284] [phantomjs] Fix bug in 587021cd9f717181b44e881941aca3f8d753758b (#4833) Authored by: elyse0 --- yt_dlp/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index d2756a0061..56b8330ff8 100644 --- a/yt_dlp/extractor/openload.py +++ 
b/yt_dlp/extractor/openload.py @@ -212,7 +212,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w 'jscode': jscode, })) - stdout = self.execute(jscode, video_id, note2) + stdout = self.execute(jscode, video_id, note=note2) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') From 07a1250e0e90515ff8142161536f9dafa6eaba1b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 3 Sep 2022 17:56:23 +0530 Subject: [PATCH 132/284] [outtmpl] Curly braces to filter keys --- README.md | 2 +- test/test_YoutubeDL.py | 13 ++++++++++++- yt_dlp/YoutubeDL.py | 40 +++++++++++++++++++++++++++------------- yt_dlp/utils.py | 13 ++++++++++--- 4 files changed, 50 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 28fad2815c..4a5456f97e 100644 --- a/README.md +++ b/README.md @@ -1210,7 +1210,7 @@ # OUTPUT TEMPLATE The field names themselves (the part inside the parenthesis) can also have some special formatting: -1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a `.` (dot) separator. You can also do python slicing using `:`. E.g. `%(tags.0)s`, `%(subtitles.en.-1.ext)s`, `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. `%()s` refers to the entire infodict. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields +1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a dot `.` separator; e.g. `%(tags.0)s`, `%(subtitles.en.-1.ext)s`. You can do Python slicing with colon `:`; E.g. `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. Curly braces `{}` can be used to build dictionaries with only specific keys; e.g. `%(formats.:.{format_id,height})#j`. An empty field name `%()s` refers to the entire infodict; e.g. `%(.{id,title})s`. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields 1. **Addition**: Addition and subtraction of numeric fields can be done using `+` and `-` respectively. E.g. 
`%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d` diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 426e52305d..60e4571084 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -662,7 +662,11 @@ def test_add_extra_info(self): 'playlist_autonumber': 2, '__last_playlist_index': 100, 'n_entries': 10, - 'formats': [{'id': 'id 1'}, {'id': 'id 2'}, {'id': 'id 3'}] + 'formats': [ + {'id': 'id 1', 'height': 1080, 'width': 1920}, + {'id': 'id 2', 'height': 720}, + {'id': 'id 3'} + ] } def test_prepare_outtmpl_and_filename(self): @@ -729,6 +733,7 @@ def test(tmpl, expected, *, info=None, **params): self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError)) test('%(invalid@tmpl|def)s', 'none', outtmpl_na_placeholder='none') test('%(..)s', 'NA') + test('%(formats.{id)s', 'NA') # Entire info_dict def expect_same_infodict(out): @@ -813,6 +818,12 @@ def expect_same_infodict(out): test('%(formats.:2:-1)r', repr(FORMATS[:2:-1])) test('%(formats.0.id.-1+id)f', '1235.000000') test('%(formats.0.id.-1+formats.1.id.-1)d', '3') + out = json.dumps([{'id': f['id'], 'height.:2': str(f['height'])[:2]} + if 'height' in f else {'id': f['id']} + for f in FORMATS]) + test('%(formats.:.{id,height.:2})j', (out, sanitize(out))) + test('%(formats.:.{id,height}.id)l', ', '.join(f['id'] for f in FORMATS)) + test('%(.{id,title})j', ('{"id": "1234"}', '{"id": "1234"}')) # Alternates test('%(title,id)s', '1234') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a6bbbb1280..58c5c47501 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1127,8 +1127,12 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): '-': float.__sub__, } # Field is of the form key1.key2... - # where keys (except first) can be string, int or slice - FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') + # where keys (except first) can be string, int, slice or "{field, ...}" + FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'} + FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % { + 'inner': FIELD_INNER_RE, + 'field': rf'\w*(?:\.{FIELD_INNER_RE})*' + } MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) INTERNAL_FORMAT_RE = re.compile(rf'''(?x) @@ -1142,11 +1146,20 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): (?:\|(?P<default>.*?))? 
)$''') - def _traverse_infodict(k): - k = k.split('.') - if k[0] == '': - k.pop(0) - return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True) + def _traverse_infodict(fields): + fields = [f for x in re.split(r'\.({.+?})\.?', fields) + for f in ([x] if x.startswith('{') else x.split('.'))] + for i in (0, -1): + if fields and not fields[i]: + fields.pop(i) + + for i, f in enumerate(fields): + if not f.startswith('{'): + continue + assert f.endswith('}'), f'No closing brace for {f} in {fields}' + fields[i] = {k: k.split('.') for k in f[1:-1].split(',')} + + return traverse_obj(info_dict, fields, is_user_input=True, traverse_string=True) def get_value(mdict): # Object traversal @@ -2800,12 +2813,13 @@ def _forceprint(self, key, info_dict): info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions')) def format_tmpl(tmpl): - mobj = re.match(r'\w+(=?)$', tmpl) - if mobj and mobj.group(1): - return f'{tmpl[:-1]} = %({tmpl[:-1]})r' - elif mobj: - return f'%({tmpl})s' - return tmpl + mobj = re.fullmatch(r'([\w.:,-]|(?P<dict>{[\w.:,-]+}))+=', tmpl) + if not mobj: + return tmpl + elif not mobj.group('dict'): + return '\n'.join(f'{f} = %({f})r' for f in tmpl[:-1].split(',')) + tmpl = f'.{tmpl[:-1]}' if tmpl.startswith('{') else tmpl[:-1] + return f'{tmpl} = %({tmpl})#j' for tmpl in self.params['forceprint'].get(key, []): self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy)) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 00f2fbf423..90042aa8b9 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5280,7 +5280,7 @@ def traverse_obj( @param path_list A list of paths which are checked one by one. Each path is a list of keys where each key is a: - None: Do nothing - - string: A dictionary key + - string: A dictionary key / regex group - int: An index into a list - tuple: A list of keys all of which will be traversed - Ellipsis: Fetch all values in the object @@ -5290,12 +5290,16 @@ def traverse_obj( @param expected_type Only accept final value of this type (Can also be any callable) @param get_all Return all the values obtained from a path or only the first one @param casesense Whether to consider dictionary keys as case sensitive + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API + + @param path_list In addition to the above, + - dict: Given {k:v, ...}; return {k: traverse_obj(obj, v), ...} @param is_user_input Whether the keys are generated from user input. If True, strings are converted to int/slice if necessary @param traverse_string Whether to traverse inside strings. If True, any non-compatible object will also be converted into a string - # TODO: Write tests - ''' + ''' # TODO: Write tests if not casesense: _lower = lambda k: (k.lower() if isinstance(k, str) else k) path_list = (map(_lower, variadic(path)) for path in path_list) @@ -5309,6 +5313,7 @@ def _traverse_obj(obj, path, _current_depth=0): if isinstance(key, (list, tuple)): obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] key = ... 
+ if key is ...: obj = (obj.values() if isinstance(obj, dict) else obj if isinstance(obj, (list, tuple, LazyList)) @@ -5316,6 +5321,8 @@ def _traverse_obj(obj, path, _current_depth=0): _current_depth += 1 depth = max(depth, _current_depth) return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj] + elif isinstance(key, dict): + obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()}) elif callable(key): if isinstance(obj, (list, tuple, LazyList)): obj = enumerate(obj) From 7657ec7ed6318dd66dd72cc100ba7bc5b911366e Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Sat, 3 Sep 2022 22:09:45 -0500 Subject: [PATCH 133/284] [utils] `base_url`: URL paths can contain `&` (#4841) Authored by: elyse0 Closes #4187 --- test/test_utils.py | 1 + yt_dlp/utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 67cd966d8e..96477c53fc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -566,6 +566,7 @@ def test_base_url(self): self.assertEqual(base_url('http://foo.de/bar/'), 'http://foo.de/bar/') self.assertEqual(base_url('http://foo.de/bar/baz'), 'http://foo.de/bar/') self.assertEqual(base_url('http://foo.de/bar/baz?x=z/x/c'), 'http://foo.de/bar/') + self.assertEqual(base_url('http://foo.de/bar/baz&x=z&w=y/x/c'), 'http://foo.de/bar/baz&x=z&w=y/x/') def test_urljoin(self): self.assertEqual(urljoin('http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 90042aa8b9..53939f2902 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2479,7 +2479,7 @@ def url_basename(url): def base_url(url): - return re.match(r'https?://[^?#&]+/', url).group() + return re.match(r'https?://[^?#]+/', url).group() def urljoin(base, path): From 48c8424bd9e03fdfd5c4c4495de233e896eb1f16 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 6 Sep 2022 19:56:56 +0530 Subject: [PATCH 134/284] Fix bug in 07a1250e0e90515ff8142161536f9dafa6eaba1b --- yt_dlp/YoutubeDL.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 58c5c47501..99db8be923 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2813,13 +2813,16 @@ def _forceprint(self, key, info_dict): info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions')) def format_tmpl(tmpl): - mobj = re.fullmatch(r'([\w.:,-]|(?P<dict>{[\w.:,-]+}))+=', tmpl) + mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl) if not mobj: return tmpl - elif not mobj.group('dict'): - return '\n'.join(f'{f} = %({f})r' for f in tmpl[:-1].split(',')) - tmpl = f'.{tmpl[:-1]}' if tmpl.startswith('{') else tmpl[:-1] - return f'{tmpl} = %({tmpl})#j' + + fmt = '%({})s' + if tmpl.startswith('{'): + tmpl = f'.{tmpl}' + if tmpl.endswith('='): + tmpl, fmt = tmpl[:-1], '{0} = %({0})#j' + return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(','))) for tmpl in self.params['forceprint'].get(key, []): self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy)) From be9c0884d7af01f9b658975a98a91d71c420d34f Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 7 Sep 2022 17:28:53 +0530 Subject: [PATCH 135/284] [extractor/BiliIntlSeries] Fix `_VALID_URL` Closes #4825 --- yt_dlp/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 59f5791d1e..7e63dad0f1 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -975,7 +975,7 @@ def _real_extract(self, url): class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, From 17ffed184237b3686212cc73290e5cdd0f6f20ca Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 7 Sep 2022 17:35:45 +0530 Subject: [PATCH 136/284] [docs] Improvements * Move detailed installation instructions to https://github.com/yt-dlp/yt-dlp/wiki/Installation * Link to wiki where applicable * Fix some mistakes. Closes #4853, Closes #4855, Closes #4852 * Improve some error messages --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 2 +- .../ISSUE_TEMPLATE/2_site_support_request.yml | 2 +- .github/ISSUE_TEMPLATE/4_bug_report.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml | 2 +- .../2_site_support_request.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml | 2 +- CONTRIBUTING.md | 2 +- CONTRIBUTORS | 1 + README.md | 91 ++++--------------- yt_dlp/YoutubeDL.py | 31 ++++--- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/youtube.py | 13 ++- yt_dlp/options.py | 2 +- yt_dlp/utils.py | 2 +- 14 files changed, 55 insertions(+), 101 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index b77a5c8070..af0320569c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -22,7 +22,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 39d5ec8cce..55ee9d3b7e 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -22,7 +22,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge + - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 79b3849492..4613fd35d1 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -22,7 +22,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index 16efba5793..e1b1e51380 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -16,7 +16,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 522eb751eb..12a1c65987 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -16,7 +16,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge + - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index fd966e8ca3..377efbe338 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -16,7 +16,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d9d5f47304..a8ac671dcf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -161,7 +161,7 @@ ## Adding new feature or making overarching changes ## Adding support for a new site -If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](https://www.github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. yt-dlp does **not support** such sites thus pull requests adding support for them **will be rejected**. +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#is-the-website-primarily-used-for-piracy)**. yt-dlp does **not support** such sites thus pull requests adding support for them **will be rejected**. 
 After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 8bede1efd4..7859170568 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -308,3 +308,4 @@ SamantazFox
 shreyasminocha
 tejasa97
 xenov
+satan1st
diff --git a/README.md b/README.md
index 4a5456f97e..77e597ba01 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@
 * [CONTRIBUTING](CONTRIBUTING.md#contributing-to-yt-dlp)
     * [Opening an Issue](CONTRIBUTING.md#opening-an-issue)
     * [Developer Instructions](CONTRIBUTING.md#developer-instructions)
-* [MORE](#more)
+* [WIKI](https://github.com/yt-dlp/yt-dlp/wiki)
 <!-- MANPAGE: END EXCLUDED SECTION -->
 
@@ -158,76 +158,26 @@ ### Differences in default behavior
 
 # INSTALLATION
 
-You can install yt-dlp using one of the following methods:
-
-### Using the release binary
-
-You can simply download the [correct binary file](#release-files) for your OS
-
 <!-- MANPAGE: BEGIN EXCLUDED SECTION -->
 [![Windows](https://img.shields.io/badge/-Windows_x64-blue.svg?style=for-the-badge&logo=windows)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)
-[![Linux](https://img.shields.io/badge/-Linux/BSD-red.svg?style=for-the-badge&logo=linux)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)
+[![Unix](https://img.shields.io/badge/-Linux/BSD-red.svg?style=for-the-badge&logo=linux)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)
 [![MacOS](https://img.shields.io/badge/-MacOS-lightblue.svg?style=for-the-badge&logo=apple)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos)
+[![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp)
 [![Source Tarball](https://img.shields.io/badge/-Source_tar-green.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)
 [![Other variants](https://img.shields.io/badge/-Other-grey.svg?style=for-the-badge)](#release-files)
 [![All versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases)
 <!-- MANPAGE: END EXCLUDED SECTION -->
 
-Note: The manpages, shell completion files etc. are available in the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)
+You can install yt-dlp using [the binaries](#release-files), [PIP](https://pypi.org/project/yt-dlp) or via a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions
 
-<!-- TODO: Move to Wiki -->
-In UNIX-like OSes (MacOS, Linux, BSD), you can also install the same in one of the following ways:
-
-```
-sudo curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp
-sudo chmod a+rx /usr/local/bin/yt-dlp
-```
-
-```
-sudo wget https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -O /usr/local/bin/yt-dlp
-sudo chmod a+rx /usr/local/bin/yt-dlp
-```
-
-```
-sudo aria2c https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp --dir /usr/local/bin -o yt-dlp
-sudo chmod a+rx /usr/local/bin/yt-dlp
-```
-
-
-### With [PIP](https://pypi.org/project/pip)
-
-You can install the [PyPI package](https://pypi.org/project/yt-dlp) with:
-```
-python3 -m pip install -U yt-dlp
-```
-
-You can install without any of the optional dependencies using:
-```
-python3 -m pip install --no-deps -U yt-dlp
-```
-
-If you want to be on the cutting edge, you can also install the master branch with:
-```
-python3 -m pip install --force-reinstall https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
-```
-
-On some systems, you may need to use `py` or `python` instead of `python3`
-
-<!-- TODO: Add to Wiki, Remove Taps -->
-### With [Homebrew](https://brew.sh)
-
-macOS or Linux users that are using Homebrew can also install it by:
-
-```
-brew install yt-dlp/taps/yt-dlp
-```
 
 ## UPDATE
-You can use `yt-dlp -U` to update if you are [using the provided release](#using-the-release-binary)
+You can use `yt-dlp -U` to update if you are [using the release binaries](#release-files)
 
-If you [installed with pip](#with-pip), simply re-run the same command that was used to install the program
+If you [installed with PIP](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program
+
+For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) or refer to their documentation
 
-If you [installed using Homebrew](#with-homebrew), run `brew upgrade yt-dlp/taps/yt-dlp`
 
 <!-- MANPAGE: BEGIN EXCLUDED SECTION -->
 ## RELEASE FILES
@@ -256,11 +206,14 @@ #### Misc
 
 File|Description
 :---|:---
-[yt-dlp.tar.gz](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)|Source tarball. Also contains manpages, completions, etc
+[yt-dlp.tar.gz](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)|Source tarball
 [SHA2-512SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-512SUMS)|GNU-style SHA512 sums
 [SHA2-256SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-256SUMS)|GNU-style SHA256 sums
 <!-- MANPAGE: END EXCLUDED SECTION -->
+
+Note: The manpages, shell completion files etc. are available in the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)
+
 ## DEPENDENCIES
 Python versions 3.7+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly.
@@ -722,10 +675,10 @@ ## Filesystem Options:
                                      Currently supported keyrings are: basictext,
                                      gnomekeyring, kwallet
     --no-cookies-from-browser        Do not load cookies from browser (default)
-    --cache-dir DIR                  Location in the filesystem where youtube-dl
-                                     can store some downloaded information (such
-                                     as client ids and signatures) permanently.
-                                     By default $XDG_CACHE_HOME/yt-dlp or
+    --cache-dir DIR                  Location in the filesystem where yt-dlp can
+                                     store some downloaded information (such as
+                                     client ids and signatures) permanently. By
+                                     default $XDG_CACHE_HOME/yt-dlp or
                                      ~/.cache/yt-dlp
     --no-cache-dir                   Disable filesystem caching
     --rm-cache-dir                   Delete all filesystem cache files
@@ -1220,7 +1173,7 @@ # OUTPUT TEMPLATE
 
 1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty.
 
-1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. E.g. `%(uploader|Unknown)s`
+1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s`
 
 1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (e.g. 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted)
@@ -1364,12 +1317,6 @@ # OUTPUT TEMPLATE
 
 In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title.
 
-<!-- MANPAGE: BEGIN EXCLUDED SECTION -->
-#### Output template and Windows batch files
-
-If you are using an output template inside a Windows batch file then you must escape plain percent characters (`%`) by doubling, so that `-o "%(title)s-%(id)s.%(ext)s"` should become `-o "%%(title)s-%%(id)s.%%(ext)s"`. However you should not touch `%`'s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`.
-<!-- MANPAGE: END EXCLUDED SECTION -->
-
 #### Output template examples
 
 ```bash
@@ -2141,5 +2088,5 @@ #### Removed
 # CONTRIBUTING
 See [CONTRIBUTING.md](CONTRIBUTING.md#contributing-to-yt-dlp) for instructions on [Opening an Issue](CONTRIBUTING.md#opening-an-issue) and [Contributing code to the project](CONTRIBUTING.md#developer-instructions)
 
-# MORE
-For FAQ see the [youtube-dl README](https://github.com/ytdl-org/youtube-dl#faq)
+# WIKI
+See the [Wiki](https://github.com/yt-dlp/yt-dlp/wiki) for more information
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 99db8be923..a7b8813979 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -251,8 +251,8 @@ class YoutubeDL:
     matchtitle:        Download only matching titles.
     rejecttitle:       Reject downloads for matching titles.
     logger:            Log messages to a logging.Logger instance.
-    logtostderr:       Log messages to stderr instead of stdout.
-    consoletitle:      Display progress in console window's titlebar.
+    logtostderr:       Print everything to stderr instead of stdout.
+    consoletitle:      Display progress in console window's titlebar.
     writedescription:  Write the video description to a .description file
     writeinfojson:     Write the video description to a .info.json file
     clean_infojson:    Remove private fields from the infojson
@@ -1419,18 +1419,19 @@ def add_extra_info(info_dict, extra_info):
     def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                      process=True, force_generic_extractor=False):
         """
-        Return a list with a dictionary for each video extracted.
+        Extract and return the information dictionary of the URL
 
         Arguments:
-        url -- URL to extract
+        @param url          URL to extract
 
         Keyword arguments:
-        download -- whether to download videos during extraction
-        ie_key -- extractor key hint
-        extra_info -- dictionary containing the extra values to add to each result
-        process -- whether to resolve all unresolved references (URLs, playlist items),
-                   must be True for download to work.
-        force_generic_extractor -- force using the generic extractor
+        @param download     Whether to download videos
+        @param process      Whether to resolve all unresolved references (URLs, playlist items).
+                            Must be True for download to work
+        @param ie_key       Use only the extractor with this key
+
+        @param extra_info   Dictionary containing the extra values to add to the info (For internal use only)
+        @param force_generic_extractor  Force using the generic extractor (Deprecated; use ie_key='Generic')
         """
 
         if extra_info is None:
@@ -2525,11 +2526,11 @@ def sanitize_numeric_fields(info):
 
         info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
         if not self.params.get('allow_unplayable_formats'):
             formats = [f for f in formats if not f.get('has_drm')]
-            if info_dict['_has_drm'] and formats and all(
-                    f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
-                self.report_warning(
-                    'This video is DRM protected and only images are available for download. '
-                    'Use --list-formats to see them')
+
+        if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
+            self.report_warning(
+                f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}'
+                'only images are available for download. Use --list-formats to see them'.capitalize())
 
         get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
         if not get_from_start:
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index c76133d8f5..02a4c6cec6 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -509,7 +509,7 @@ def _login_hint(self, method=NO_DEFAULT, netrc=None):
             'password': f'Use {password_hint}',
             'cookies': (
                 'Use --cookies-from-browser or --cookies for the authentication. '
-                'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
+                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
         }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
 
     def __init__(self, downloader=None):
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 4a5d6805e9..3ca189e449 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3336,10 +3336,15 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i
             if isinstance(e, JSInterpreter.Exception):
                 phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} '
                                   f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') - self.report_warning( - f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' - f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) - self.write_debug(e, only_once=True) + if player_url: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + self.write_debug(e, only_once=True) + else: + self.report_warning( + 'Cannot decrypt nsig without player_url: You may experience throttling for some formats', + video_id=video_id, only_once=True) throttled = True if itag: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 4aa0acfbc5..26392f6193 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1417,7 +1417,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not load cookies from browser (default)') filesystem.add_option( '--cache-dir', dest='cachedir', default=None, metavar='DIR', - help='Location in the filesystem where youtube-dl can store some downloaded information (such as client ids and signatures) permanently. By default $XDG_CACHE_HOME/yt-dlp or ~/.cache/yt-dlp') + help='Location in the filesystem where yt-dlp can store some downloaded information (such as client ids and signatures) permanently. By default $XDG_CACHE_HOME/yt-dlp or ~/.cache/yt-dlp') filesystem.add_option( '--no-cache-dir', action='store_false', dest='cachedir', help='Disable filesystem caching') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 53939f2902..06699341c9 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1610,7 +1610,7 @@ def prepare_line(line): if f'{line.strip()} '[0] in '[{"': raise http.cookiejar.LoadError( 'Cookies file must be Netscape formatted, not JSON. See ' - 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl') + 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') continue cf.seek(0) From 1015ceeeaf847bce88b60fe20d08a09ab8ce7d47 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 8 Sep 2022 06:18:35 +0530 Subject: [PATCH 137/284] [extractor/MLBTV] Detect live streams --- yt_dlp/extractor/mlb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index ab0edbae39..5e1b281053 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -343,6 +343,7 @@ def _real_extract(self, url): return { 'id': video_id, 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), + 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE', 'formats': formats, 'subtitles': subtitles, 'http_headers': {'Authorization': f'Bearer {self._access_token}'}, From ae1035646a6be09c2aed3e22eb8910f341ddacfe Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 8 Sep 2022 15:03:43 +0530 Subject: [PATCH 138/284] Allow a `set` to be passed as `download_archive` --- yt_dlp/YoutubeDL.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a7b8813979..95fa5fb19a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -293,9 +293,8 @@ class YoutubeDL: downloaded. Videos without view count information are always downloaded. None for no limit. 
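The hunk that follows makes `download_archive` accept an in-memory `set` (in addition to a file name) when embedding yt-dlp. A rough usage sketch — the URL and archive entry are only illustrative:

```python
import yt_dlp

# Illustrative only: keep the archive in memory instead of a file,
# e.g. to share it across several runs within one process
archive = set()
with yt_dlp.YoutubeDL({'download_archive': archive}) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])

# Entries use the same 'EXTRACTOR VIDEO_ID' form as the archive file,
# e.g. 'youtube BaW_jenozKc'; IDs already in the set are skipped
print(archive)
```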
- download_archive: File name of a file where all downloads are recorded. - Videos already present in the file are not downloaded - again. + download_archive: A set, or the name of a file where all downloads are recorded. + Videos already present in the file are not downloaded again. break_on_existing: Stop the download process after attempting to download a file that is in the archive. break_on_reject: Stop the download process when encountering a video that @@ -723,21 +722,23 @@ def check_deprecated(param, option, suggestion): def preload_download_archive(fn): """Preload the archive, if any is specified""" + archive = set() if fn is None: - return False + return archive + elif not isinstance(fn, os.PathLike): + return fn + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: - self.archive.add(line.strip()) + archive.add(line.strip()) except OSError as ioe: if ioe.errno != errno.ENOENT: raise - return False - return True + return archive - self.archive = set() - preload_download_archive(self.params.get('download_archive')) + self.archive = preload_download_archive(self.params.get('download_archive')) def warn_if_short_id(self, argv): # short YouTube ID starting with dash? @@ -3465,8 +3466,7 @@ def _make_archive_id(self, info_dict): return make_archive_id(extractor, video_id) def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: + if not self.archive: return False vid_ids = [self._make_archive_id(info_dict)] @@ -3479,9 +3479,11 @@ def record_download_archive(self, info_dict): return vid_id = self._make_archive_id(info_dict) assert vid_id + self.write_debug(f'Adding to archive: {vid_id}') - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') + if isinstance(fn, os.PathLike): + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') self.archive.add(vid_id) @staticmethod From 3ffb2f5bea02ad353411981d342e8db79d57fb88 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 9 Sep 2022 12:34:39 +1200 Subject: [PATCH 139/284] [extractor/youtube] Fix video like count extraction Support new combined button layout Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 3ca189e449..6c4e995b8c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3911,19 +3911,24 @@ def process_language(container, base_url, lang_code, sub_name, query): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} - for getter, regex in [( - lambda x: x['defaultText']['accessibility']['accessibilityData'], - r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ - lambda x: x['accessibility'], - lambda x: x['accessibilityData']['accessibilityData'], - ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: - label = (try_get(tbr, getter, dict) or {}).get('label') - if label: - mobj = re.match(regex, label) - if mobj: - info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) - break + tbrs = variadic( + traverse_obj( + tlb, 'toggleButtonRenderer', + ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer'), + default=[])) + for tbr in tbrs: + for getter, regex in [( + lambda x: 
x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: From 0c0b78b273a15f360508f80a2920e39a63b520bc Mon Sep 17 00:00:00 2001 From: CplPwnies <barron879@gmail.com> Date: Thu, 8 Sep 2022 23:52:05 -0500 Subject: [PATCH 140/284] [extractor/adobepass] Add MSO AlticeOne (Optimum TV) (#4875) * Suddenlink rebrand to Optimum. Fixes #4874 Authored by: CplPwnies --- yt_dlp/extractor/adobepass.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index a2666c2b83..ec1be008a3 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1344,6 +1344,11 @@ 'username_field': 'username', 'password_field': 'password', }, + 'AlticeOne': { + 'name': 'Optimum TV', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, } @@ -1705,7 +1710,7 @@ def extract_redirect_url(html, url=None, fatal=False): mso_info.get('username_field', 'username'): username, mso_info.get('password_field', 'password'): password } - if mso_id == 'Cablevision': + if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) if mso_id != 'Rogers': From c26f9b991a0681fd3ea548d535919cec1fbbd430 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 9 Sep 2022 05:16:46 +0000 Subject: [PATCH 141/284] [extractor/youtube] Support changing extraction language (#4470) Adds `--extractor-args youtube:lang=<supported lang code>` extractor arg to prefer translated fields (e.g. title and description) of that language, if available, for all YouTube extractors. See README or error message for list of supported language codes. Closes https://github.com/yt-dlp/yt-dlp/issues/387 Authored by: coletdjnz --- README.md | 2 + yt_dlp/extractor/youtube.py | 359 ++++++++++++++++++++++++++++-------- 2 files changed, 286 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 77e597ba01..62c83e721e 100644 --- a/README.md +++ b/README.md @@ -1705,6 +1705,8 @@ #### youtube * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests +* `lang`: Supported content language code to prefer translated metadata of this language (case-sensitive). By default, video primary language metadata is preferred, with a fallback to `en` translated. + * See youtube.py for list of supported content language codes. #### youtubetab (YouTube playlists, channels, feeds, etc.) 
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6c4e995b8c..ac1a5f2109 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,6 +2,7 @@ import calendar import copy import datetime +import enum import hashlib import itertools import json @@ -275,6 +276,15 @@ def build_innertube_clients(): build_innertube_clients() +class BadgeType(enum.Enum): + AVAILABILITY_UNLISTED = enum.auto() + AVAILABILITY_PRIVATE = enum.auto() + AVAILABILITY_PUBLIC = enum.auto() + AVAILABILITY_PREMIUM = enum.auto() + AVAILABILITY_SUBSCRIPTION = enum.auto() + LIVE_NOW = enum.auto() + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" @@ -367,6 +377,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?piped\.privacy\.com\.de', ) + # extracted from account/account_menu ep + # XXX: These are the supported YouTube UI and API languages, + # which is slightly different from languages supported for translation in YouTube studio + _SUPPORTED_LANG_CODES = [ + 'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es', + 'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv', + 'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi', + 'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw', + 'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml', + 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko' + ] + + @functools.cached_property + def _preferred_lang(self): + """ + Returns a language code supported by YouTube for the user preferred language. + Returns None if no preferred language set. + """ + preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0] + if not preferred_lang: + return + if preferred_lang not in self._SUPPORTED_LANG_CODES: + raise ExtractorError( + f'Unsupported language code: {preferred_lang}. Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.', + expected=True) + elif preferred_lang != 'en': + self.report_warning( + f'Preferring "{preferred_lang}" translated fields. 
Note that some metadata extraction may fail or be incorrect.')
+        return preferred_lang
+
     def _initialize_consent(self):
         cookies = self._get_cookies('https://www.youtube.com/')
         if cookies.get('__Secure-3PSID'):
@@ -391,7 +431,7 @@ def _initialize_pref(self):
             pref = dict(urllib.parse.parse_qsl(pref_cookie.value))
         except ValueError:
             self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
-        pref.update({'hl': 'en', 'tz': 'UTC'})
+        pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'})
         self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref))
 
     def _real_initialize(self):
@@ -439,7 +479,7 @@ def _extract_context(self, ytcfg=None, default_client='web'):
             (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
         # Enforce language and tz for extraction
         client_context = traverse_obj(context, 'client', expected_type=dict, default={})
-        client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
+        client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
         return context
 
     _SAPISID = None
@@ -678,13 +718,49 @@ def _extract_and_report_alerts(self, data, *args, **kwargs):
         return self._report_alerts(self._extract_alerts(data), *args, **kwargs)
 
     def _extract_badges(self, renderer: dict):
-        badges = set()
-        for badge in try_get(renderer, lambda x: x['badges'], list) or []:
-            label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], str)
-            if label:
-                badges.add(label.lower())
+        privacy_icon_map = {
+            'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED,
+            'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE,
+            'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC
+        }
+
+        badge_style_map = {
+            'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION,
+            'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM,
+            'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW
+        }
+
+        label_map = {
+            'unlisted': BadgeType.AVAILABILITY_UNLISTED,
+            'private': BadgeType.AVAILABILITY_PRIVATE,
+            'members only': BadgeType.AVAILABILITY_SUBSCRIPTION,
+            'live': BadgeType.LIVE_NOW,
+            'premium': BadgeType.AVAILABILITY_PREMIUM
+        }
+
+        badges = []
+        for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]):
+            badge_type = (
+                privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str))
+                or badge_style_map.get(traverse_obj(badge, 'style'))
+            )
+            if badge_type:
+                badges.append({'type': badge_type})
+                continue
+
+            # fallback, won't work in some languages
+            label = traverse_obj(badge, 'label', expected_type=str, default='')
+            for match, label_badge_type in label_map.items():
+                if match in label.lower():
+                    badges.append({'type': label_badge_type})
+                    continue
+
         return badges
 
+    @staticmethod
+    def _has_badge(badges, badge_type):
+        return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type))
+
     @staticmethod
     def _get_text(data, *path_list, max_runs=None):
         for path in path_list or [None]:
@@ -755,9 +831,9 @@ def extract_relative_time(relative_time_text):
         except ValueError:
             return None
 
-    def _extract_time_text(self, renderer, *path_list):
-        """@returns (timestamp, time_text)"""
-        text = self._get_text(renderer, *path_list) or ''
+    def _parse_time_text(self, text):
+        if not text:
+            return
         dt = self.extract_relative_time(text)
         timestamp = None
         if isinstance(dt, datetime.datetime):
@@ -770,9 +846,10 @@ def _parse_time_text(self, text):
                 (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', 
r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), text.lower(), 'time text', default=None))) - if text and timestamp is None: - self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True) - return timestamp, text + if text and timestamp is None and self._preferred_lang in (None, 'en'): + self.report_warning( + f'Cannot parse localized time text "{text}"', only_once=True) + return timestamp def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, @@ -848,7 +925,7 @@ def _extract_video(self, renderer): channel_id = traverse_obj( renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) - timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') + time_text = self._get_text(renderer, 'publishedTimeText') or '' scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), @@ -874,15 +951,21 @@ def _extract_video(self, renderer): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - 'upload_date': (strftime_or_none(timestamp, '%Y%m%d') + 'upload_date': (strftime_or_none(self._parse_time_text(time_text), '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None), 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style == 'LIVE' or 'live now' in badges + else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) else None), 'release_timestamp': scheduled_timestamp, - 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) + 'availability': + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None) } @@ -2306,6 +2389,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': [], 'uploader_url': 'http://www.youtube.com/user/nao20010128nao', } + }, { + # Prefer primary title+description language metadata by default + # Do not prefer translated description if primary is empty + 'url': 'https://www.youtube.com/watch?v=el3E4MbxRqQ', + 'info_dict': { + 'id': 'el3E4MbxRqQ', + 'ext': 'mp4', + 'title': 'dlp test video 2 - primary sv no desc', + 'description': '', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'like_count': int, + 'playable_in_embed': True, + 'availability': 'unlisted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp', + 'age_limit': 0, + 'duration': 5, + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'live_status': 'not_live', + 'upload_date': '20220908', + 'categories': ['People & Blogs'], + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': 
{'skip_download': True} + }, { + # Extractor argument: prefer translated title+description + 'url': 'https://www.youtube.com/watch?v=gHKT4uU8Zng', + 'info_dict': { + 'id': 'gHKT4uU8Zng', + 'ext': 'mp4', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'duration': 5, + 'live_status': 'not_live', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'upload_date': '20220728', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'categories': ['People & Blogs'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp', + 'title': 'dlp test video title translated (fr)', + 'availability': 'public', + 'uploader': 'cole-dlp-test-acc', + 'age_limit': 0, + 'description': 'dlp test video description translated (fr)', + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, + 'expected_warnings': [r'Preferring "fr" translated fields'], }, { 'note': '6 channel audio', 'url': 'https://www.youtube.com/watch?v=zgdo7-RRjgo', @@ -2907,8 +3045,10 @@ def _extract_comment(self, comment_renderer, parent=None): text = self._get_text(comment_renderer, 'contentText') - # note: timestamp is an estimate calculated from the current time and time_text - timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') + # Timestamp is an estimate calculated from the current time and time_text + time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + timestamp = self._parse_time_text(time_text) + author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) @@ -3554,11 +3694,19 @@ def _real_extract(self, url): microformats = traverse_obj( player_responses, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - video_title = ( - get_first(video_details, 'title') - or self._get_text(microformats, (..., 'title')) - or search_meta(['og:title', 'twitter:title', 'title'])) - video_description = get_first(video_details, 'shortDescription') + + translated_title = self._get_text(microformats, (..., 'title')) + video_title = (self._preferred_lang and translated_title + or get_first(video_details, 'title') # primary + or translated_title + or search_meta(['og:title', 'twitter:title', 'title'])) + translated_description = self._get_text(microformats, (..., 'description')) + original_description = get_first(video_details, 'shortDescription') + video_description = ( + self._preferred_lang and translated_description + # If original description is blank, it will be an empty string. + # Do not prefer translated description in this case. 
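+                # (Precedence note: the conditional expression binds loosest, so this
+                # parses as ((preferred_lang and translated) or original) if original
+                # is not None else translated.)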
+ or original_description if original_description is not None else translated_description) multifeed_metadata_list = get_first( player_responses, @@ -3988,7 +4136,8 @@ def process_language(container, base_url, lang_code, sub_name, query): and info.get('live_status') != 'is_upcoming' and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) ): - upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') or upload_date + upload_date = strftime_or_none( + self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date info['upload_date'] = upload_date for to, frm in fallbacks.items(): @@ -4000,33 +4149,25 @@ def process_language(container, base_url, lang_code, sub_name, query): if v: info[d_k] = v - is_private = get_first(video_details, 'isPrivate', expected_type=bool) - is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool) - is_membersonly = None - is_premium = None - if initial_data and is_private is not None: - is_membersonly = False - is_premium = False - contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - badge_labels = set() - for content in contents: - if not isinstance(content, dict): - continue - badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer'))) - for badge_label in badge_labels: - if badge_label.lower() == 'members only': - is_membersonly = True - elif badge_label.lower() == 'premium': - is_premium = True - elif badge_label.lower() == 'unlisted': - is_unlisted = True + badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) - info['availability'] = self._availability( - is_private=is_private, - needs_premium=is_premium, - needs_subscription=is_membersonly, - needs_auth=info['age_limit'] >= 18, - is_unlisted=None if is_private is None else is_unlisted) + is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or get_first(video_details, 'isPrivate', expected_type=bool)) + + info['availability'] = ( + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=is_private, + needs_premium=( + self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) + or False if initial_data and is_private is not None else None), + needs_subscription=( + self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) + or False if initial_data and is_private is not None else None), + needs_auth=info['age_limit'] >= 18, + is_unlisted=None if is_private is None else ( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or get_first(microformats, 'isUnlisted', expected_type=bool)))) info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage) @@ -4472,7 +4613,7 @@ def _get_uncropped(url): playlist_id = item_id playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') - last_updated_unix, _ = self._extract_time_text(playlist_stats, 2) + last_updated_unix = self._parse_time_text(self._get_text(playlist_stats, 2)) if title is None: title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id title += format_field(selected_tab, 'title', ' - %s') @@ -4566,31 +4707,37 @@ def _extract_availability(self, data): Note: Unless YouTube tells us explicitly, we do not assume it is public @param data: response """ - is_private = is_unlisted = None renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or 
{} - badge_labels = self._extract_badges(renderer) + + player_header_privacy = traverse_obj( + data, ('header', 'playlistHeaderRenderer', 'privacy'), expected_type=str) + + badges = self._extract_badges(renderer) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge - privacy_dropdown_entries = try_get( - renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or [] - for renderer_dict in privacy_dropdown_entries: - is_selected = try_get( - renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False - if not is_selected: - continue - label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label')) - if label: - badge_labels.add(label.lower()) - break + privacy_setting_icon = traverse_obj( + renderer, ( + 'privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', + lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), + get_all=False, expected_type=str) - for badge_label in badge_labels: - if badge_label == 'unlisted': - is_unlisted = True - elif badge_label == 'private': - is_private = True - elif badge_label == 'public': - is_unlisted = is_private = False - return self._availability(is_private, False, False, False, is_unlisted) + return ( + 'public' if ( + self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + or player_header_privacy == 'PUBLIC' + or privacy_setting_icon == 'PRIVACY_PUBLIC') + else self._availability( + is_private=( + self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or player_header_privacy == 'PRIVATE' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None), + is_unlisted=( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or player_header_privacy == 'UNLISTED' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None else None), + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_auth=False)) @staticmethod def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): @@ -4866,6 +5013,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'public', }, 'playlist_count': 1, }, { @@ -4883,6 +5031,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'public', }, 'playlist_count': 0, }, { @@ -5029,6 +5178,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008', 'channel': 'Christiaan008', + 'availability': 'public', }, 'playlist_count': 96, }, { @@ -5047,6 +5197,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'view_count': int, 'description': '', 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'availability': 'public', }, 'playlist_mincount': 1123, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5070,6 
+5221,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Interstellar Movie', 'description': '', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 21, }, { @@ -5088,6 +5240,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 200, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5107,6 +5260,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/c/blanktv', 'modified_date': r're:\d{8}', 'description': '', + 'availability': 'public', }, 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5125,6 +5279,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'channel_url': 'https://www.youtube.com/user/Computerphile', 'channel': 'Computerphile', + 'availability': 'public', }, 'playlist_mincount': 11, }, { @@ -5290,6 +5445,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', 'tags': [], 'channel': 'NoCopyrightSounds', + 'availability': 'public', }, 'playlist_mincount': 166, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5310,6 +5466,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': r're:\d{8}', 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'description': '', + 'availability': 'public', }, 'expected_warnings': [ 'The URL does not have a videos tab', @@ -5410,6 +5567,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Royalty Free Music - Topic', 'view_count': int, 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'availability': 'public', }, 'expected_warnings': [ 'does not have a videos tab', @@ -5443,6 +5601,45 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', }, 'playlist_mincount': 2 + }, { + 'note': 'translated tab name', + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'description': '', + 'title': 'cole-dlp-test-acc - 再生リスト', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], + }, { + # XXX: this should really check flat playlist entries, but the test suite doesn't support that + 'note': 'preferred lang set with playlist with translated video titles', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'info_dict': { + 'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'description': 'test', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 
'title': 'dlp test playlist', + 'availability': 'public', + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], }] @classmethod @@ -5527,10 +5724,20 @@ def get_mobj(url): tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) if tabs: selected_tab = self._extract_selected_tab(tabs) - selected_tab_name = selected_tab.get('title', '').lower() + selected_tab_url = urljoin( + url, traverse_obj(selected_tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'))) + translated_tab_name = selected_tab.get('title', '').lower() + + # Prefer tab name from tab url as it is always in en, + # but only when preferred lang is set as it may not extract reliably in all cases. + selected_tab_name = (self._preferred_lang in (None, 'en') and translated_tab_name + or selected_tab_url and get_mobj(selected_tab_url)['tab'][1:] # primary + or translated_tab_name) + if selected_tab_name == 'home': selected_tab_name = 'featured' requested_tab_name = mobj['tab'][1:] + if 'no-youtube-channel-redirect' not in compat_opts: if requested_tab_name == 'live': # Live tab should have redirected to the video raise UserNotLive(video_id=mobj['id']) @@ -5642,6 +5849,7 @@ class YoutubePlaylistIE(InfoExtractor): 'channel': 'milan', 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', + 'availability': 'public', }, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { @@ -5660,6 +5868,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/c/愛低音的國王', 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { @@ -5848,7 +6057,7 @@ def _extract_notification_renderer(self, notification): title = self._search_regex( rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, 'video title', default=None) - upload_date = (strftime_or_none(self._extract_time_text(notification, 'sentTimeText')[0], '%Y%m%d') + upload_date = (strftime_or_none(self._parse_time_text(self._get_text(notification, 'sentTimeText')), '%Y%m%d') if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key()) else None) return { From 0831d95c46e0a198957d44262bb251113346a6b4 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 10 Sep 2022 10:06:48 +1200 Subject: [PATCH 142/284] [extractor/BiliIntl] Support uppercase lang in `_VALID_URL` Seen in some rare cases Authored by: coletdjnz --- yt_dlp/extractor/bilibili.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 7e63dad0f1..2c29bf3ce4 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -909,7 +909,7 @@ def _perform_login(self, username, password): class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' _TESTS = [{ # Bstation page 'url': 'https://www.bilibili.tv/en/play/34613/341736', @@ -952,6 +952,10 @@ class BiliIntlIE(BiliIntlBaseIE): # No language in URL 'url': 'https://www.bilibili.tv/video/2019955076', 'only_matching': True, + }, { 
+ # Uppercase language in URL + 'url': 'https://www.bilibili.tv/EN/video/2019955076', + 'only_matching': True, }] def _real_extract(self, url): @@ -975,7 +979,7 @@ def _real_extract(self, url): class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, @@ -993,6 +997,9 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): }, { 'url': 'https://www.biliintl.com/en/play/34613', 'only_matching': True, + }, { + 'url': 'https://www.biliintl.com/EN/play/34613', + 'only_matching': True, }] def _entries(self, series_id): From 0cb0fdbbfe32a0e8bc03c3248b95ec473a98b5cc Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 9 Sep 2022 09:58:41 +0530 Subject: [PATCH 143/284] [extractor/common] Escape `%` in `representation_id` of m3u8 Closes #4877 --- yt_dlp/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 02a4c6cec6..dae952f6a8 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2914,6 +2914,8 @@ def extract_Initialization(source): def prepare_template(template_name, identifiers): tmpl = representation_ms_info[template_name] + if representation_id is not None: + tmpl = tmpl.replace('$RepresentationID$', representation_id) # First of, % characters outside $...$ templates # must be escaped by doubling for proper processing # by % operator string formatting used further (see @@ -2928,8 +2930,6 @@ def prepare_template(template_name, identifiers): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - if representation_id is not None: - t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t.replace('$$', '$') From 941e881e1fe20ee8955f3b751ce26953d9e86656 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 9 Sep 2022 23:14:20 +0530 Subject: [PATCH 144/284] Fix bug in ae1035646a6be09c2aed3e22eb8910f341ddacfe Closes #4881 --- yt_dlp/YoutubeDL.py | 3 ++- yt_dlp/utils.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 95fa5fb19a..83b5100eef 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -108,6 +108,7 @@ get_domain, int_or_none, iri_to_uri, + is_path_like, join_nonempty, locked_file, make_archive_id, @@ -725,7 +726,7 @@ def preload_download_archive(fn): archive = set() if fn is None: return archive - elif not isinstance(fn, os.PathLike): + elif not is_path_like(fn): return fn self.write_debug(f'Loading archive file {fn!r}') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 06699341c9..a036e2233b 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1497,6 +1497,10 @@ def https_open(self, req): raise +def is_path_like(f): + return isinstance(f, (str, bytes, os.PathLike)) + + class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): """ See [1] for cookie file format. 
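For context on the fix above: a plain `str` is not `os.PathLike`, which is why the earlier `isinstance(fn, os.PathLike)` check misclassified ordinary filename strings (the bug referenced in #4881). A quick standalone sketch of the behaviour, mirroring the `is_path_like()` helper added in the hunk above:

```python
import os
from pathlib import Path

# Same check as the is_path_like() helper added above
def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))

# A plain str is a valid path but NOT os.PathLike
assert not isinstance('archive.txt', os.PathLike)
assert is_path_like('archive.txt') and is_path_like(Path('archive.txt'))
# A set is not path-like, so it is kept as an in-memory download archive
assert not is_path_like({'youtube BaW_jenozKc'})
```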
@@ -1515,7 +1519,7 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): def __init__(self, filename=None, *args, **kwargs): super().__init__(None, *args, **kwargs) - if self.is_path(filename): + if is_path_like(filename): filename = os.fspath(filename) self.filename = filename @@ -1523,13 +1527,9 @@ def __init__(self, filename=None, *args, **kwargs): def _true_or_false(cndn): return 'TRUE' if cndn else 'FALSE' - @staticmethod - def is_path(file): - return isinstance(file, (str, bytes, os.PathLike)) - @contextlib.contextmanager def open(self, file, *, write=False): - if self.is_path(file): + if is_path_like(file): with open(file, 'w' if write else 'r', encoding='utf-8') as f: yield f else: From deae7c171180ddd4735c414306f084f86ef27e07 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 10 Sep 2022 03:46:54 +0530 Subject: [PATCH 145/284] [cleanup] Misc --- README.md | 14 ++++++++------ yt_dlp/YoutubeDL.py | 6 ++++-- yt_dlp/downloader/common.py | 2 +- yt_dlp/extractor/generic.py | 2 +- yt_dlp/utils.py | 2 +- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 62c83e721e..9f331663d5 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![YT-DLP](https://raw.githubusercontent.com/yt-dlp/yt-dlp/master/.github/banner.svg)](#readme) -[![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#release-files "Release") +[![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](##installation "Installation") [![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPi") [![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") [![Matrix](https://img.shields.io/matrix/yt-dlp:matrix.org?color=brightgreen&labelColor=555555&label=&logo=element&style=for-the-badge)](https://matrix.to/#/#yt-dlp:matrix.org "Matrix") @@ -25,6 +25,7 @@ * [NEW FEATURES](#new-features) * [Differences in default behavior](#differences-in-default-behavior) * [INSTALLATION](#installation) + * [Detailed instructions](https://github.com/yt-dlp/yt-dlp/wiki/Installation) * [Update](#update) * [Release Files](#release-files) * [Dependencies](#dependencies) @@ -49,7 +50,6 @@ * [CONFIGURATION](#configuration) * [Authentication with .netrc file](#authentication-with-netrc-file) * [OUTPUT TEMPLATE](#output-template) - * [Output template and Windows batch files](#output-template-and-windows-batch-files) * [Output template examples](#output-template-examples) * [FORMAT SELECTION](#format-selection) * [Filtering Formats](#filtering-formats) @@ -66,6 +66,7 @@ * [Opening an Issue](CONTRIBUTING.md#opening-an-issue) * [Developer Instructions](CONTRIBUTING.md#developer-instructions) * [WIKI](https://github.com/yt-dlp/yt-dlp/wiki) + * [FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ) <!-- MANPAGE: END EXCLUDED SECTION --> @@ -249,7 +250,7 @@ ### Misc * [**secretstorage**](https://github.com/mitya57/secretstorage) - For `--cookies-from-browser` to access the **Gnome** keyring while decrypting cookies of **Chromium**-based browsers on **Linux**. 
Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE) * Any external downloader that you want to use with `--downloader` -#### Deprecated +### Deprecated * [**avconv** and **avprobe**](https://www.libav.org) - Now **deprecated** alternative to ffmpeg. License [depends on the build](https://libav.org/legal) * [**sponskrub**](https://github.com/faissaloo/SponSkrub) - For using the now **deprecated** [sponskrub options](#sponskrub-options). Licensed under [GPLv3+](https://github.com/faissaloo/SponSkrub/blob/master/LICENCE.md) @@ -279,7 +280,7 @@ ### Standalone PyInstaller Builds **Important**: Running `pyinstaller` directly **without** using `pyinst.py` is **not** officially supported. This may or may not work correctly. ### Platform-independent Binary (UNIX) -You will need the build tools `python` (3.6+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. +You will need the build tools `python` (3.7+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. After installing these, simply run `make`. @@ -1705,8 +1706,7 @@ #### youtube * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests -* `lang`: Supported content language code to prefer translated metadata of this language (case-sensitive). By default, video primary language metadata is preferred, with a fallback to `en` translated. - * See youtube.py for list of supported content language codes. +* `lang`: Language code to prefer translated metadata of this language (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. 
This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -1766,6 +1766,8 @@ # PLUGINS If you are a plugin author, add [ytdlp-plugins](https://github.com/topics/ytdlp-plugins) as a topic to your repository for discoverability +See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) + # EMBEDDING YT-DLP diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 83b5100eef..3cfd0a6997 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1247,9 +1247,11 @@ def create_key(outer_mobj): delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json - value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt + value, fmt = json.dumps( + value, default=_dumpjson_default, + indent=4 if '#' in flags else None, ensure_ascii=False), str_fmt elif fmt[-1] == 'h': # html - value, fmt = escapeHTML(value), str_fmt + value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted value = map(str, variadic(value) if '#' in flags else [value]) value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 9ade4269e8..ab557a47ac 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -121,7 +121,7 @@ def format_seconds(seconds): if time.hours > 99: return '--:--:--' if not time.hours: - return '%02d:%02d' % time[1:-1] + return ' %02d:%02d' % time[1:-1] return '%02d:%02d:%02d' % time[:-1] format_eta = format_seconds diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f53122b20c..af7f93b67d 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2765,7 +2765,7 @@ def _real_extract(self, url): 'age_limit': self._rta_search(webpage), }) - domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
https://github.com/ytdl-org/youtube-dl/issues/2448) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index a036e2233b..666ef67ffb 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3625,7 +3625,7 @@ def determine_protocol(info_dict): ext = determine_ext(url) if ext == 'm3u8': - return 'm3u8' + return 'm3u8' if info_dict.get('is_live') else 'm3u8_native' elif ext == 'f4m': return 'f4m' From 9c935fbc72de8f53c2d65f2ac9ef80b8358e2baf Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 11 Sep 2022 05:10:26 +0530 Subject: [PATCH 146/284] Fix bug in ae1035646a6be09c2aed3e22eb8910f341ddacfe Closes #4890 --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3cfd0a6997..3b6281066b 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3484,7 +3484,7 @@ def record_download_archive(self, info_dict): assert vid_id self.write_debug(f'Adding to archive: {vid_id}') - if isinstance(fn, os.PathLike): + if is_path_like(fn): with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + '\n') self.archive.add(vid_id) From 22df97f9c5ef5aaf6d4451d1c632dee4dc325c5f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 11 Sep 2022 09:02:35 +0000 Subject: [PATCH 147/284] Fix bug in 941e881e1fe20ee8955f3b751ce26953d9e86656 (#4893) Authored by: bashonly --- yt_dlp/cookies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 0ccd22947e..c3b14f03bb 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -30,6 +30,7 @@ YoutubeDLCookieJar, error_to_str, expand_path, + is_path_like, try_call, ) @@ -97,7 +98,7 @@ def load_cookies(cookie_file, browser_specification, ydl): extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) if cookie_file is not None: - is_filename = YoutubeDLCookieJar.is_path(cookie_file) + is_filename = is_path_like(cookie_file) if is_filename: cookie_file = expand_path(cookie_file) From 1060f82f899b61a0a1c63df37ecdf6dc2bae50e8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 13 Sep 2022 16:18:15 +0530 Subject: [PATCH 148/284] Fix `--config-location -` --- yt_dlp/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 666ef67ffb..25910ed6c7 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5554,6 +5554,9 @@ def load_configs(self): self.parsed_args = self.own_args for location in opts.config_locations or []: if location == '-': + if location in self._loaded_paths: + continue + self._loaded_paths.add(location) self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin') continue location = os.path.join(directory, expand_path(location)) From 2314b4d89fc111ddfcb25937210f1f1c2390cc4a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 16 Sep 2022 16:37:38 +0530 Subject: [PATCH 149/284] Allow plugin extractors to replace the built-in ones This allows easier plugin chaining; e.g. 
- https://gist.github.com/pukkandan/24f13ff1ed385c5a390c1d7bd130d8f7 - https://gist.github.com/pukkandan/fcf5ca1785c80f64e471f0ee14f990fb --- yt_dlp/extractor/common.py | 13 +++++++++++++ yt_dlp/extractor/extractors.py | 7 +++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index dae952f6a8..30042d61fe 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -5,6 +5,7 @@ import http.client import http.cookiejar import http.cookies +import inspect import itertools import json import math @@ -3900,6 +3901,18 @@ def _extract_url(cls, webpage): # TODO: Remove """Only for compatibility with some older extractors""" return next(iter(cls._extract_embed_urls(None, webpage) or []), None) + @classmethod + def __init_subclass__(cls, *, plugin_name=None, **kwargs): + if plugin_name: + mro = inspect.getmro(cls) + super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] + cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + while getattr(super_class, '__wrapped__', None): + super_class = super_class.__wrapped__ + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + + return super().__init_subclass__(**kwargs) + class SearchInfoExtractor(InfoExtractor): """ diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 32818a024a..610e02f906 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -3,6 +3,9 @@ from ..utils import load_plugins +# NB: Must be before other imports so that plugins can be correctly injected +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) + _LAZY_LOADER = False if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): with contextlib.suppress(ImportError): @@ -19,5 +22,5 @@ ] _ALL_CLASSES.append(GenericIE) # noqa: F405 -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) -_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +globals().update(_PLUGIN_CLASSES) +_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() From 2b9d02167fdf2fbe5bd8306144ab45027da263c1 Mon Sep 17 00:00:00 2001 From: Locke <hamannsun@gmail.com> Date: Fri, 16 Sep 2022 23:59:02 +0800 Subject: [PATCH 150/284] [extractor/bilibili] Add space.bilibili extractors (#4468) Authored by: lockmatrix --- yt_dlp/extractor/_extractors.py | 4 +- yt_dlp/extractor/bilibili.py | 144 +++++++++++++++++++++++++------- 2 files changed, 119 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index aedf063f66..6bf769a9e5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -190,7 +190,9 @@ BilibiliAudioIE, BilibiliAudioAlbumIE, BiliBiliPlayerIE, - BilibiliChannelIE, + BilibiliSpaceVideoIE, + BilibiliSpaceAudioIE, + BilibiliSpacePlaylistIE, BiliIntlIE, BiliIntlSeriesIE, BiliLiveIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 2c29bf3ce4..2e03aee856 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2,8 +2,8 @@ import hashlib import itertools import functools -import re import math +import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -13,23 +13,24 @@ ) from ..utils import ( ExtractorError, + InAdvancePagedList, + OnDemandPagedList, filter_dict, - int_or_none, float_or_none, + int_or_none, mimetype2ext, + parse_count, parse_iso8601, qualities, - traverse_obj, - parse_count, smuggle_url, srt_subtitles_timecode, str_or_none, strip_jsonp, + 
traverse_obj, unified_timestamp, unsmuggle_url, urlencode_postdata, url_or_none, - OnDemandPagedList ) @@ -505,39 +506,126 @@ def _real_extract(self, url): season_info.get('bangumi_title'), season_info.get('evaluate')) -class BilibiliChannelIE(InfoExtractor): - _VALID_URL = r'https?://space.bilibili\.com/(?P<id>\d+)' - _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp" +class BilibiliSpaceBaseIE(InfoExtractor): + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + first_page = fetch_page(1) + metadata = get_metadata(first_page) + + paged_list = InAdvancePagedList( + lambda idx: get_entries(fetch_page(idx) if idx > 1 else first_page), + metadata['page_count'], metadata['page_size']) + + return metadata, paged_list + + +class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://space.bilibili.com/3985676/video', - 'info_dict': {}, - 'playlist_mincount': 112, + 'info_dict': { + 'id': '3985676', + }, + 'playlist_mincount': 178, }] - def _entries(self, list_id): - count, max_count = 0, None + def _real_extract(self, url): + playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') + if not is_video_url: + self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' + 'To download audios, add a "/audio" to the URL') - for page_num in itertools.count(1): - data = self._download_json( - self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data'] + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/space/arc/search', playlist_id, + note=f'Downloading page {page_idx}', + query={'mid': playlist_id, 'pn': page_idx, 'jsonp': 'jsonp'})['data'] - max_count = max_count or traverse_obj(data, ('page', 'count')) + def get_metadata(page_data): + page_size = page_data['page']['ps'] + entry_count = page_data['page']['count'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + } - entries = traverse_obj(data, ('list', 'vlist')) - if not entries: - return - for entry in entries: - yield self.url_result( - 'https://www.bilibili.com/video/%s' % entry['bvid'], - BiliBiliIE.ie_key(), entry['bvid']) + def get_entries(page_data): + for entry in traverse_obj(page_data, ('list', 'vlist')) or []: + yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', BiliBiliIE, entry['bvid']) - count += len(entries) - if max_count and count >= max_count: - return + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id) + + +class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' + _TESTS = [{ + 'url': 'https://space.bilibili.com/3985676/audio', + 'info_dict': { + 'id': '3985676', + }, + 'playlist_mincount': 1, + }] def _real_extract(self, url): - list_id = self._match_id(url) - return self.playlist_result(self._entries(list_id), list_id) + playlist_id = self._match_id(url) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id, + note=f'Downloading page {page_idx}', + query={'uid': playlist_id, 'pn': page_idx, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data'] + + def get_metadata(page_data): + return { + 'page_count': page_data['pageCount'], + 'page_size': page_data['pageSize'], + } + + def 
get_entries(page_data): + for entry in page_data.get('data', []): + yield self.url_result(f'https://www.bilibili.com/audio/au{entry["id"]}', BilibiliAudioIE, entry['id']) + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id) + + +class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', + 'info_dict': { + 'id': '2142762_57445', + 'title': '《底特律 变人》' + }, + 'playlist_mincount': 31, + }] + + def _real_extract(self, url): + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/polymer/space/seasons_archives_list', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'season_id': sid, 'page_num': page_idx, 'page_size': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['page_size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'title': traverse_obj(page_data, ('meta', 'name')) + } + + def get_entries(page_data): + for entry in page_data.get('archives', []): + yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', + BiliBiliIE, entry['bvid']) + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id, metadata['title']) class BilibiliCategoryIE(InfoExtractor): From fc2ba496fd09ca68c7e6eeb2c11e7000d08ff099 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Sat, 17 Sep 2022 01:04:23 +0900 Subject: [PATCH 151/284] Allow open ranges for time ranges (#4940) Authored by: Lesmiscore --- yt_dlp/YoutubeDL.py | 5 +++-- yt_dlp/__init__.py | 11 ++++++----- yt_dlp/options.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3b6281066b..0bfc47767a 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2711,17 +2711,18 @@ def to_screen(*msg): (f['format_id'] for f in formats_to_download)) if requested_ranges: to_screen(f'Downloading {len(requested_ranges)} time ranges:', - (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges)) + (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges)) max_downloads_reached = False for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]): new_info = self._copy_infodict(info_dict) new_info.update(fmt) offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') + end_time = offset + min(chapter.get('end_time', duration), duration) if chapter or offset: new_info.update({ 'section_start': offset + chapter.get('start_time', 0), - 'section_end': offset + min(chapter.get('end_time', duration), duration), + 'section_end': end_time if end_time < offset + duration else None, 'section_title': chapter.get('title'), 'section_number': chapter.get('index'), }) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 356155fcdd..87d431c6e0 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -326,14 +326,15 @@ def validate_outtmpl(tmpl, msg): def parse_chapters(name, value): chapters, ranges = [], [] + parse_timestamp = lambda x: float('inf') if x in 
('inf', 'infinite') else parse_duration(x) for regex in value or []: if regex.startswith('*'): - for range in regex[1:].split(','): - dur = tuple(map(parse_duration, range.strip().split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - ranges.append(dur) - else: + for range_ in map(str.strip, regex[1:].split(',')): + mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_) + dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf')) + if None in (dur or [None]): raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end') + ranges.append(dur) continue try: chapters.append(re.compile(regex)) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 26392f6193..9ad48486e8 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -964,7 +964,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Download only chapters whose title matches the given regular expression. ' 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. ' 'Needs ffmpeg. This option can be used multiple times to download multiple sections, ' - 'e.g. --download-sections "*10:15-15:00" --download-sections "intro"')) + 'e.g. --download-sections "*10:15-inf" --download-sections "intro"')) downloader.add_option( '--downloader', '--external-downloader', dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str', From 5736d79172c47ff84740d5720467370a560febad Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 16 Sep 2022 18:24:29 +0530 Subject: [PATCH 152/284] Support environment variables in `--ffmpeg-location` Closes #4938 --- yt_dlp/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 87d431c6e0..cab2dd62f9 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -920,6 +920,7 @@ def _real_main(argv=None): # We may need ffmpeg_location without having access to the YoutubeDL instance # See https://github.com/yt-dlp/yt-dlp/issues/2191 if opts.ffmpeg_location: + opts.ffmpeg_location = expand_path(opts.ffmpeg_location) FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) with YoutubeDL(ydl_opts) as ydl: From 8817a80d3ac69f2dfd12bdc41657c4a04139807c Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Fri, 16 Sep 2022 19:02:00 +0200 Subject: [PATCH 153/284] [cookies] Parse cookies leniently (#4780) Closes #4776, #3778 Authored by: Grub4K --- test/test_cookies.py | 146 +++++++++++++++++++++++++++++++++++++ yt_dlp/cookies.py | 96 ++++++++++++++++++++++++ yt_dlp/extractor/common.py | 3 +- 3 files changed, 244 insertions(+), 1 deletion(-) diff --git a/test/test_cookies.py b/test/test_cookies.py index cfeb11b552..61619df297 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -3,6 +3,7 @@ from yt_dlp import cookies from yt_dlp.cookies import ( + LenientSimpleCookie, LinuxChromeCookieDecryptor, MacChromeCookieDecryptor, WindowsChromeCookieDecryptor, @@ -137,3 +138,148 @@ def test_safari_cookie_parsing(self): def test_pbkdf2_sha1(self): key = pbkdf2_sha1(b'peanuts', b' ' * 16, 1, 16) self.assertEqual(key, b'g\xe1\x8e\x0fQ\x1c\x9b\xf3\xc9`!\xaa\x90\xd9\xd34') + + +class TestLenientSimpleCookie(unittest.TestCase): + def _run_tests(self, *cases): + for message, raw_cookie, expected in cases: + cookie = LenientSimpleCookie(raw_cookie) + + with self.subTest(message, expected=expected): + self.assertEqual(cookie.keys(), 
expected.keys(), message) + + for key, expected_value in expected.items(): + morsel = cookie[key] + if isinstance(expected_value, tuple): + expected_value, expected_attributes = expected_value + else: + expected_attributes = {} + + attributes = { + key: value + for key, value in dict(morsel).items() + if value != "" + } + self.assertEqual(attributes, expected_attributes, message) + + self.assertEqual(morsel.value, expected_value, message) + + def test_parsing(self): + self._run_tests( + # Copied from https://github.com/python/cpython/blob/v3.10.7/Lib/test/test_http_cookies.py + ( + "Test basic cookie", + "chips=ahoy; vienna=finger", + {"chips": "ahoy", "vienna": "finger"}, + ), + ( + "Test quoted cookie", + 'keebler="E=mc2; L=\\"Loves\\"; fudge=\\012;"', + {"keebler": 'E=mc2; L="Loves"; fudge=\012;'}, + ), + ( + "Allow '=' in an unquoted value", + "keebler=E=mc2", + {"keebler": "E=mc2"}, + ), + ( + "Allow cookies with ':' in their name", + "key:term=value:term", + {"key:term": "value:term"}, + ), + ( + "Allow '[' and ']' in cookie values", + "a=b; c=[; d=r; f=h", + {"a": "b", "c": "[", "d": "r", "f": "h"}, + ), + ( + "Test basic cookie attributes", + 'Customer="WILE_E_COYOTE"; Version=1; Path=/acme', + {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})}, + ), + ( + "Test flag only cookie attributes", + 'Customer="WILE_E_COYOTE"; HttpOnly; Secure', + {"Customer": ("WILE_E_COYOTE", {"httponly": True, "secure": True})}, + ), + ( + "Test flag only attribute with values", + "eggs=scrambled; httponly=foo; secure=bar; Path=/bacon", + {"eggs": ("scrambled", {"httponly": "foo", "secure": "bar", "path": "/bacon"})}, + ), + ( + "Test special case for 'expires' attribute, 4 digit year", + 'Customer="W"; expires=Wed, 01 Jan 2010 00:00:00 GMT', + {"Customer": ("W", {"expires": "Wed, 01 Jan 2010 00:00:00 GMT"})}, + ), + ( + "Test special case for 'expires' attribute, 2 digit year", + 'Customer="W"; expires=Wed, 01 Jan 98 00:00:00 GMT', + {"Customer": ("W", {"expires": "Wed, 01 Jan 98 00:00:00 GMT"})}, + ), + ( + "Test extra spaces in keys and values", + "eggs = scrambled ; secure ; path = bar ; foo=foo ", + {"eggs": ("scrambled", {"secure": True, "path": "bar"}), "foo": "foo"}, + ), + ( + "Test quoted attributes", + 'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"', + {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})} + ), + # Our own tests that CPython passes + ( + "Allow ';' in quoted value", + 'chips="a;hoy"; vienna=finger', + {"chips": "a;hoy", "vienna": "finger"}, + ), + ( + "Keep only the last set value", + "a=c; a=b", + {"a": "b"}, + ), + ) + + def test_lenient_parsing(self): + self._run_tests( + ( + "Ignore and try to skip invalid cookies", + 'chips={"ahoy;": 1}; vienna="finger;"', + {"vienna": "finger;"}, + ), + ( + "Ignore cookies without a name", + "a=b; unnamed; c=d", + {"a": "b", "c": "d"}, + ), + ( + "Ignore '\"' cookie without name", + 'a=b; "; c=d', + {"a": "b", "c": "d"}, + ), + ( + "Skip all space separated values", + "x a=b c=d x; e=f", + {"a": "b", "c": "d", "e": "f"}, + ), + ( + "Skip all space separated values", + 'x a=b; data={"complex": "json", "with": "key=value"}; x c=d x', + {"a": "b", "c": "d"}, + ), + ( + "Expect quote mending", + 'a=b; invalid="; c=d', + {"a": "b", "c": "d"}, + ), + ( + "Reset morsel after invalid to not capture attributes", + "a=b; invalid; Version=1; c=d", + {"a": "b", "c": "d"}, + ), + ( + "Continue after non-flag attribute without value", + "a=b; path; Version=1; c=d", + {"a": "b", "c": "d"}, + ), + ) diff --git 
a/yt_dlp/cookies.py b/yt_dlp/cookies.py index c3b14f03bb..d502e91da6 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,6 +1,7 @@ import base64 import contextlib import http.cookiejar +import http.cookies import json import os import re @@ -990,3 +991,98 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta if profile is not None and _is_path(profile): profile = os.path.expanduser(profile) return browser_name, profile, keyring, container + + +class LenientSimpleCookie(http.cookies.SimpleCookie): + """More lenient version of http.cookies.SimpleCookie""" + # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py + _LEGAL_KEY_CHARS = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=" + _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + r"\[\]" + + _RESERVED = { + "expires", + "path", + "comment", + "domain", + "max-age", + "secure", + "httponly", + "version", + "samesite", + } + + _FLAGS = {"secure", "httponly"} + + # Added 'bad' group to catch the remaining value + _COOKIE_PATTERN = re.compile(r""" + \s* # Optional whitespace at start of cookie + (?P<key> # Start of group 'key' + [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter + ) # End of group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + ( # Start of potential value + (?P<val> # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + [""" + _LEGAL_VALUE_CHARS + r"""]* # Any word or empty string + ) # End of group 'val' + | # or + (?P<bad>(?:\\;|[^;])*?) # 'bad' group fallback for invalid values + ) # End of potential value + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. 
+ """, re.ASCII | re.VERBOSE) + + def load(self, data): + # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4776 + if not isinstance(data, str): + return super().load(data) + + morsel = None + index = 0 + length = len(data) + + while 0 <= index < length: + match = self._COOKIE_PATTERN.search(data, index) + if not match: + break + + index = match.end(0) + if match.group("bad"): + morsel = None + continue + + key, value = match.group("key", "val") + + if key[0] == "$": + if morsel is not None: + morsel[key[1:]] = True + continue + + lower_key = key.lower() + if lower_key in self._RESERVED: + if morsel is None: + continue + + if value is None: + if lower_key not in self._FLAGS: + morsel = None + continue + value = True + else: + value, _ = self.value_decode(value) + + morsel[key] = value + + elif value is not None: + morsel = self.get(key, http.cookies.Morsel()) + real_value, coded_value = self.value_decode(value) + morsel.set(key, real_value, coded_value) + self[key] = morsel + + else: + morsel = None diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 30042d61fe..e8fa8fdde8 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -22,6 +22,7 @@ from ..compat import functools # isort: split from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..cookies import LenientSimpleCookie from ..downloader import FileDownloader from ..downloader.f4m import get_base_url, remove_encrypted_media from ..utils import ( @@ -3632,7 +3633,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None, def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return http.cookies.SimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 3166e6840c7f7b1ea3984f0e40a892d87e690480 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 16 Sep 2022 23:05:49 +0530 Subject: [PATCH 154/284] [extractor/generic] Pass through referer from json-ld Closes #4941 --- yt_dlp/extractor/generic.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index af7f93b67d..55b3addde4 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2621,7 +2621,7 @@ def _real_extract(self, url): default_search += ':' return self.url_result(default_search + url) - url, smuggled_data = unsmuggle_url(url) + url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: @@ -2638,7 +2638,10 @@ def _real_extract(self, url): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
- full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'}) + full_response = self._request_webpage(url, video_id, headers={ + 'Accept-Encoding': '*', + **smuggled_data.get('http_headers', {}) + }) new_url = full_response.geturl() if url != new_url: self.report_following_redirect(new_url) @@ -2657,14 +2660,15 @@ def _real_extract(self, url): m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: self.report_detected('direct video link') + headers = smuggled_data.get('http_headers', {}) format_id = str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): - formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): - formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) elif format_id == 'f4m': - formats = self._extract_f4m_formats(url, video_id) + formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ 'format_id': format_id, @@ -2673,8 +2677,11 @@ def _real_extract(self, url): }] info_dict['direct'] = True self._sort_formats(formats) - info_dict['formats'] = formats - info_dict['subtitles'] = subtitles + info_dict.update({ + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': headers, + }) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2919,7 +2926,11 @@ def _real_extract(self, url): self.report_detected('JSON LD') return merge_dicts({ '_type': 'url_transparent', - 'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}), + 'url': smuggle_url(json_ld['url'], { + 'force_videoid': video_id, + 'to_generic': True, + 'http_headers': {'Referer': url}, + }), }, json_ld, info_dict) def check_video(vurl): From 2b24afa6d7f0ed09a663b4483d29f7c05258edfe Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 17 Sep 2022 10:14:44 +0530 Subject: [PATCH 155/284] Improve 5736d79172c47ff84740d5720467370a560febad --- yt_dlp/__init__.py | 4 +++- yt_dlp/cookies.py | 4 ++-- yt_dlp/utils.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index cab2dd62f9..29c467b0e8 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -411,6 +411,9 @@ def metadataparser_actions(f): if opts.download_archive is not None: opts.download_archive = expand_path(opts.download_archive) + if opts.ffmpeg_location is not None: + opts.ffmpeg_location = expand_path(opts.ffmpeg_location) + if opts.user_agent is not None: opts.headers.setdefault('User-Agent', opts.user_agent) if opts.referer is not None: @@ -920,7 +923,6 @@ def _real_main(argv=None): # We may need ffmpeg_location without having access to the YoutubeDL instance # See https://github.com/yt-dlp/yt-dlp/issues/2191 if opts.ffmpeg_location: - opts.ffmpeg_location = expand_path(opts.ffmpeg_location) FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) with YoutubeDL(ydl_opts) as ydl: diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index d502e91da6..24a8250dab 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -988,8 +988,8 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta raise ValueError(f'unsupported 
browser: "{browser_name}"') if keyring not in (None, *SUPPORTED_KEYRINGS): raise ValueError(f'unsupported keyring: "{keyring}"') - if profile is not None and _is_path(profile): - profile = os.path.expanduser(profile) + if profile is not None and _is_path(expand_path(profile)): + profile = expand_path(profile) return browser_name, profile, keyring, container diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 25910ed6c7..a24ca828e0 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -762,7 +762,7 @@ def sanitized_Request(url, *args, **kwargs): def expand_path(s): - """Expand shell variables and ~""" + """Expand $ shell variables and ~""" return os.path.expandvars(compat_expanduser(s)) From 9665f15a960c4e274b0be5fbf22e6f4a6680d162 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 17 Sep 2022 11:34:04 +0530 Subject: [PATCH 156/284] [outtmpl] Make `%s` work in strfformat for all systems --- yt_dlp/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index a24ca828e0..f6f7c38d10 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2567,6 +2567,8 @@ def strftime_or_none(timestamp, date_format, default=None): datetime_object = datetime.datetime.utcfromtimestamp(timestamp) elif isinstance(timestamp, str): # assume YYYYMMDD datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') + date_format = re.sub( # Support %s on windows + r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format) return datetime_object.strftime(date_format) except (ValueError, TypeError, AttributeError): return default From dab284f80fb08675008eec39a4561fed1cf1617b Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 17 Sep 2022 11:57:47 +0530 Subject: [PATCH 157/284] Workaround `libc_ver` not be available on Windows Store version of Python --- yt_dlp/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f6f7c38d10..443c498148 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1966,13 +1966,16 @@ def system_identifier(): python_implementation = platform.python_implementation() if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'): python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3] + libc_ver = [] + with contextlib.suppress(OSError): # We may not have access to the executable + libc_ver = platform.libc_ver() return 'Python %s (%s %s) - %s %s' % ( platform.python_version(), python_implementation, platform.architecture()[0], platform.platform(), - format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'), + format_field(join_nonempty(*libc_ver, delim=' '), None, '(%s)'), ) From 19b4e59a1e1bf368078f90e7f735fa4576f97b64 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sat, 17 Sep 2022 20:54:21 +0530 Subject: [PATCH 158/284] [extractor/web.archive:youtube] Fix _YT_INITIAL_PLAYER_RESPONSE_RE --- yt_dlp/extractor/archiveorg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 0f40774ce1..25a289ff62 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -526,9 +526,10 @@ class YoutubeWebArchiveIE(InfoExtractor): }, ] _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x) + _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x: 
(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*| - {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}''' + {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE} + )''' _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers _YT_ALL_THUMB_SERVERS = orderedSet( From 46d72cd2c7fced093189babb484d53766f52ef57 Mon Sep 17 00:00:00 2001 From: josanabr <john.sanabria@correounivalle.edu.co> Date: Sun, 18 Sep 2022 09:32:28 -0500 Subject: [PATCH 159/284] [devscripts] make_lazy_extractors: Fix for Docker (#4958) Authored by: josanabr --- devscripts/make_lazy_extractors.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 43885331f8..383c7e057c 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -3,6 +3,7 @@ # Allow direct execution import os import sys +import shutil sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -50,12 +51,13 @@ def get_all_ies(): PLUGINS_DIRNAME = 'ytdlp_plugins' BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked' if os.path.exists(PLUGINS_DIRNAME): - os.rename(PLUGINS_DIRNAME, BLOCKED_DIRNAME) + # os.rename cannot be used, e.g. in Docker. See https://github.com/yt-dlp/yt-dlp/pull/4958 + shutil.move(PLUGINS_DIRNAME, BLOCKED_DIRNAME) try: from yt_dlp.extractor.extractors import _ALL_CLASSES finally: if os.path.exists(BLOCKED_DIRNAME): - os.rename(BLOCKED_DIRNAME, PLUGINS_DIRNAME) + shutil.move(BLOCKED_DIRNAME, PLUGINS_DIRNAME) return _ALL_CLASSES From fada8272b6c86ec43f0ccdeaa7bd29baecb4ba2d Mon Sep 17 00:00:00 2001 From: Jeroen Jacobs <github.com@jeroenj.be> Date: Sun, 18 Sep 2022 16:42:58 +0200 Subject: [PATCH 160/284] [extractor/GoPlay] Add extractor (#3412) Replaces old Vier extractors Closes https://github.com/yt-dlp/yt-dlp/issues/1546 Based on: https://github.com/ytdl-org/youtube-dl/pull/27815 Authored by: jeroenj, CNugteren, basrieter --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/goplay.py | 395 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/vier.py | 261 --------------------- 3 files changed, 396 insertions(+), 262 deletions(-) create mode 100644 yt_dlp/extractor/goplay.py delete mode 100644 yt_dlp/extractor/vier.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6bf769a9e5..43e2f93d35 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -649,6 +649,7 @@ ) from .googlesearch import GoogleSearchIE from .gopro import GoProIE +from .goplay import GoPlayIE from .goshgay import GoshgayIE from .gotostage import GoToStageIE from .gputechconf import GPUTechConfIE @@ -2021,7 +2022,6 @@ VidioLiveIE ) from .vidlii import VidLiiIE -from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, ViewLiftEmbedIE, diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py new file mode 100644 index 0000000000..31267e1aa2 --- /dev/null +++ b/yt_dlp/extractor/goplay.py @@ -0,0 +1,395 @@ +import base64 +import binascii +import datetime +import hashlib +import hmac +import json +import os + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unescapeHTML, +) + + +class GoPlayIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)' + + _NETRC_MACHINE = 'goplay' + + _TESTS = [{ + 'url': 
'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay', + 'info_dict': { + 'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811', + 'ext': 'mp4', + 'title': 'S3 - Aflevering 2', + 'series': 'De Container Cup', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'skip': 'This video is only available for registered users' + }, { + 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', + 'info_dict': { + 'id': '74e3ed07-748c-49e4-85a0-393a93337dbf', + 'ext': 'mp4', + 'title': 'A Family for the Holidays', + }, + 'skip': 'This video is only available for registered users' + }] + + _id_token = None + + def _perform_login(self, username, password): + self.report_login() + aws = AwsIdp(ie=self, pool_id='eu-west-1_dViSsKM5Y', client_id='6s1h851s8uplco5h6mqh1jac8m') + self._id_token, _ = aws.authenticate(username=username, password=password) + + def _real_initialize(self): + if not self._id_token: + raise self.raise_login_required(method='password') + + def _real_extract(self, url): + url, display_id = self._match_valid_url(url).group(0, 'display_id') + webpage = self._download_webpage(url, display_id) + video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data') + video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data') + + movie = video_data.get('movie') + if movie: + video_id = movie['videoUuid'] + info_dict = { + 'title': movie.get('title') + } + else: + episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) + video_id = episode['videoUuid'] + info_dict = { + 'title': episode.get('episodeTitle'), + 'series': traverse_obj(episode, ('program', 'title')), + 'season_number': episode.get('seasonNumber'), + 'episode_number': episode.get('episodeNumber'), + } + + api = self._download_json( + f'https://api.viervijfzes.be/content/{video_id}', + video_id, headers={'Authorization': self._id_token}) + + formats, subs = self._extract_m3u8_formats_and_subtitles( + api['video']['S'], video_id, ext='mp4', m3u8_id='HLS') + self._sort_formats(formats) + + info_dict.update({ + 'id': video_id, + 'formats': formats, + }) + + return info_dict + + +# Taken from https://github.com/add-ons/plugin.video.viervijfzes/blob/master/resources/lib/viervijfzes/auth_awsidp.py +# Released into Public domain by https://github.com/michaelarnauts + +class InvalidLoginException(ExtractorError): + """ The login credentials are invalid """ + + +class AuthenticationException(ExtractorError): + """ Something went wrong while logging in """ + + +class AwsIdp: + """ AWS Identity Provider """ + + def __init__(self, ie, pool_id, client_id): + """ + :param InfoExtrator ie: The extractor that instantiated this class. + :param str pool_id: The AWS user pool to connect to (format: <region>_<poolid>). + E.g.: eu-west-1_aLkOfYN3T + :param str client_id: The client application ID (the ID of the application connecting) + """ + + self.ie = ie + + self.pool_id = pool_id + if "_" not in self.pool_id: + raise ValueError("Invalid pool_id format. 
Should be <region>_<poolid>.") + + self.client_id = client_id + self.region = self.pool_id.split("_")[0] + self.url = "https://cognito-idp.%s.amazonaws.com/" % (self.region,) + + # Initialize the values + # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L22 + self.n_hex = 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + \ + '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + \ + 'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + \ + 'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + \ + 'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + \ + 'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + \ + '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + \ + '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + \ + 'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + \ + 'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + \ + '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + \ + 'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + \ + 'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + \ + 'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + \ + 'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + \ + '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF' + + # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L49 + self.g_hex = '2' + self.info_bits = bytearray('Caldera Derived Key', 'utf-8') + + self.big_n = self.__hex_to_long(self.n_hex) + self.g = self.__hex_to_long(self.g_hex) + self.k = self.__hex_to_long(self.__hex_hash('00' + self.n_hex + '0' + self.g_hex)) + self.small_a_value = self.__generate_random_small_a() + self.large_a_value = self.__calculate_a() + + def authenticate(self, username, password): + """ Authenticate with a username and password. """ + # Step 1: First initiate an authentication request + auth_data_dict = self.__get_authentication_request(username) + auth_data = json.dumps(auth_data_dict).encode("utf-8") + auth_headers = { + "X-Amz-Target": "AWSCognitoIdentityProviderService.InitiateAuth", + "Accept-Encoding": "identity", + "Content-Type": "application/x-amz-json-1.1" + } + auth_response_json = self.ie._download_json( + self.url, None, data=auth_data, headers=auth_headers, + note='Authenticating username', errnote='Invalid username') + challenge_parameters = auth_response_json.get("ChallengeParameters") + + if auth_response_json.get("ChallengeName") != "PASSWORD_VERIFIER": + raise AuthenticationException(auth_response_json["message"]) + + # Step 2: Respond to the Challenge with a valid ChallengeResponse + challenge_request = self.__get_challenge_response_request(challenge_parameters, password) + challenge_data = json.dumps(challenge_request).encode("utf-8") + challenge_headers = { + "X-Amz-Target": "AWSCognitoIdentityProviderService.RespondToAuthChallenge", + "Content-Type": "application/x-amz-json-1.1" + } + auth_response_json = self.ie._download_json( + self.url, None, data=challenge_data, headers=challenge_headers, + note='Authenticating password', errnote='Invalid password') + + if 'message' in auth_response_json: + raise InvalidLoginException(auth_response_json['message']) + return ( + auth_response_json['AuthenticationResult']['IdToken'], + auth_response_json['AuthenticationResult']['RefreshToken'] + ) + + def __get_authentication_request(self, username): + """ + + :param str username: The username to use + + :return: A full Authorization request. 
+ :rtype: dict + """ + auth_request = { + "AuthParameters": { + "USERNAME": username, + "SRP_A": self.__long_to_hex(self.large_a_value) + }, + "AuthFlow": "USER_SRP_AUTH", + "ClientId": self.client_id + } + return auth_request + + def __get_challenge_response_request(self, challenge_parameters, password): + """ Create a Challenge Response Request object. + + :param dict[str,str|imt] challenge_parameters: The parameters for the challenge. + :param str password: The password. + + :return: A valid and full request data object to use as a response for a challenge. + :rtype: dict + """ + user_id = challenge_parameters["USERNAME"] + user_id_for_srp = challenge_parameters["USER_ID_FOR_SRP"] + srp_b = challenge_parameters["SRP_B"] + salt = challenge_parameters["SALT"] + secret_block = challenge_parameters["SECRET_BLOCK"] + + timestamp = self.__get_current_timestamp() + + # Get a HKDF key for the password, SrpB and the Salt + hkdf = self.__get_hkdf_key_for_password( + user_id_for_srp, + password, + self.__hex_to_long(srp_b), + salt + ) + secret_block_bytes = base64.standard_b64decode(secret_block) + + # the message is a combo of the pool_id, provided SRP userId, the Secret and Timestamp + msg = \ + bytearray(self.pool_id.split('_')[1], 'utf-8') + \ + bytearray(user_id_for_srp, 'utf-8') + \ + bytearray(secret_block_bytes) + \ + bytearray(timestamp, 'utf-8') + hmac_obj = hmac.new(hkdf, msg, digestmod=hashlib.sha256) + signature_string = base64.standard_b64encode(hmac_obj.digest()).decode('utf-8') + challenge_request = { + "ChallengeResponses": { + "USERNAME": user_id, + "TIMESTAMP": timestamp, + "PASSWORD_CLAIM_SECRET_BLOCK": secret_block, + "PASSWORD_CLAIM_SIGNATURE": signature_string + }, + "ChallengeName": "PASSWORD_VERIFIER", + "ClientId": self.client_id + } + return challenge_request + + def __get_hkdf_key_for_password(self, username, password, server_b_value, salt): + """ Calculates the final hkdf based on computed S value, and computed U value and the key. + + :param str username: Username. + :param str password: Password. + :param int server_b_value: Server B value. + :param int salt: Generated salt. + + :return Computed HKDF value. + :rtype: object + """ + + u_value = self.__calculate_u(self.large_a_value, server_b_value) + if u_value == 0: + raise ValueError('U cannot be zero.') + username_password = '%s%s:%s' % (self.pool_id.split('_')[1], username, password) + username_password_hash = self.__hash_sha256(username_password.encode('utf-8')) + + x_value = self.__hex_to_long(self.__hex_hash(self.__pad_hex(salt) + username_password_hash)) + g_mod_pow_xn = pow(self.g, x_value, self.big_n) + int_value2 = server_b_value - self.k * g_mod_pow_xn + s_value = pow(int_value2, self.small_a_value + u_value * x_value, self.big_n) + hkdf = self.__compute_hkdf( + bytearray.fromhex(self.__pad_hex(s_value)), + bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value))) + ) + return hkdf + + def __compute_hkdf(self, ikm, salt): + """ Standard hkdf algorithm + + :param {Buffer} ikm Input key material. + :param {Buffer} salt Salt value. + :return {Buffer} Strong key material. + """ + + prk = hmac.new(salt, ikm, hashlib.sha256).digest() + info_bits_update = self.info_bits + bytearray(chr(1), 'utf-8') + hmac_hash = hmac.new(prk, info_bits_update, hashlib.sha256).digest() + return hmac_hash[:16] + + def __calculate_u(self, big_a, big_b): + """ Calculate the client's value U which is the hash of A and B + + :param int big_a: Large A value. + :param int big_b: Server B value. + + :return Computed U value. 
+ :rtype: int + """ + + u_hex_hash = self.__hex_hash(self.__pad_hex(big_a) + self.__pad_hex(big_b)) + return self.__hex_to_long(u_hex_hash) + + def __generate_random_small_a(self): + """ Helper function to generate a random big integer + + :return a random value. + :rtype: int + """ + random_long_int = self.__get_random(128) + return random_long_int % self.big_n + + def __calculate_a(self): + """ Calculate the client's public value A = g^a%N with the generated random number a + + :return Computed large A. + :rtype: int + """ + + big_a = pow(self.g, self.small_a_value, self.big_n) + # safety check + if (big_a % self.big_n) == 0: + raise ValueError('Safety check for A failed') + return big_a + + @staticmethod + def __long_to_hex(long_num): + return '%x' % long_num + + @staticmethod + def __hex_to_long(hex_string): + return int(hex_string, 16) + + @staticmethod + def __hex_hash(hex_string): + return AwsIdp.__hash_sha256(bytearray.fromhex(hex_string)) + + @staticmethod + def __hash_sha256(buf): + """AuthenticationHelper.hash""" + digest = hashlib.sha256(buf).hexdigest() + return (64 - len(digest)) * '0' + digest + + @staticmethod + def __pad_hex(long_int): + """ Converts a Long integer (or hex string) to hex format padded with zeroes for hashing + + :param int|str long_int: Number or string to pad. + + :return Padded hex string. + :rtype: str + """ + + if not isinstance(long_int, str): + hash_str = AwsIdp.__long_to_hex(long_int) + else: + hash_str = long_int + if len(hash_str) % 2 == 1: + hash_str = '0%s' % hash_str + elif hash_str[0] in '89ABCDEFabcdef': + hash_str = '00%s' % hash_str + return hash_str + + @staticmethod + def __get_random(nbytes): + random_hex = binascii.hexlify(os.urandom(nbytes)) + return AwsIdp.__hex_to_long(random_hex) + + @staticmethod + def __get_current_timestamp(): + """ Creates a timestamp with the correct English format. 
+ + :return: timestamp in format 'Sun Jan 27 19:00:04 UTC 2019' + :rtype: str + """ + + # We need US only data, so we cannot just do a strftime: + # Sun Jan 27 19:00:04 UTC 2019 + months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + + time_now = datetime.datetime.utcnow() + format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) + time_string = datetime.datetime.utcnow().strftime(format_string) + return time_string + + def __str__(self): + return "AWS IDP Client for:\nRegion: %s\nPoolId: %s\nAppId: %s" % ( + self.region, self.pool_id.split("_")[1], self.client_id + ) diff --git a/yt_dlp/extractor/vier.py b/yt_dlp/extractor/vier.py deleted file mode 100644 index eab894ab63..0000000000 --- a/yt_dlp/extractor/vier.py +++ /dev/null @@ -1,261 +0,0 @@ -import re -import itertools - -from .common import InfoExtractor -from ..utils import ( - urlencode_postdata, - int_or_none, - unified_strdate, -) - - -class VierIE(InfoExtractor): - IE_NAME = 'vier' - IE_DESC = 'vier.be and vijf.be' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?(?P<site>vier|vijf)\.be/ - (?: - (?: - [^/]+/videos| - video(?:/[^/]+)* - )/ - (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| - (?: - video/v3/embed| - embed/video/public - )/(?P<embed_id>\d+) - ) - ''' - _NETRC_MACHINE = 'vier' - _TESTS = [{ - 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', - 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', - 'info_dict': { - 'id': '16129', - 'display_id': 'het-wordt-warm-de-moestuin', - 'ext': 'mp4', - 'title': 'Het wordt warm in De Moestuin', - 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - 'upload_date': '20121025', - 'series': 'Plan B', - 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], - }, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', - 'info_dict': { - 'id': '2561614', - 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', - 'ext': 'mp4', - 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', - 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', - 'upload_date': '20170228', - 'series': 'Temptation Island', - 'tags': list, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'Jani gaat naar Tokio - Aflevering 4', - 'description': 'md5:aa8d611541db6ae9e863125704511f88', - 'upload_date': '20170501', - 'series': 'Jani gaat', - 'episode_number': 4, - 'tags': ['Jani Gaat', 'Volledige Aflevering'], - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', - }, { - # Requires account credentials but bypassed extraction via v3/embed page - # without metadata - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'jani-gaat-naar-tokio-aflevering-4', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Log in to extract metadata'], - }, { - # Without video id in URL - 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', - 'only_matching': True, - }, { - 
'url': 'http://www.vier.be/video/v3/embed/16129', - 'only_matching': True, - }, { - 'url': 'https://www.vijf.be/embed/video/public/4093', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', - 'only_matching': True, - }] - - def _real_initialize(self): - self._logged_in = False - - def _login(self, site): - username, password = self._get_login_info() - if username is None or password is None: - return - - login_page = self._download_webpage( - 'http://www.%s.be/user/login' % site, - None, note='Logging in', errnote='Unable to log in', - data=urlencode_postdata({ - 'form_id': 'user_login', - 'name': username, - 'pass': password, - }), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - login_error = self._html_search_regex( - r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', - login_page, 'login error', default=None) - if login_error: - self.report_warning('Unable to log in: %s' % login_error) - else: - self._logged_in = True - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - embed_id = mobj.group('embed_id') - display_id = mobj.group('display_id') or embed_id - video_id = mobj.group('id') or embed_id - site = mobj.group('site') - - if not self._logged_in: - self._login(site) - - webpage = self._download_webpage(url, display_id) - - if r'id="user-login"' in webpage: - self.report_warning( - 'Log in to extract metadata', video_id=display_id) - webpage = self._download_webpage( - 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), - display_id) - - video_id = self._search_regex( - [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id', default=video_id or display_id) - - playlist_url = self._search_regex( - r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', - webpage, 'm3u8 url', default=None, group='url') - - if not playlist_url: - application = self._search_regex( - [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default=site + '_vod') - filename = self._search_regex( - [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], - webpage, 'filename') - playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - - formats = self._extract_wowza_formats( - playlist_url, display_id, skip_protocols=['dash']) - self._sort_formats(formats) - - title = self._og_search_title(webpage, default=display_id) - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', - webpage, 'description', default=None, group='value') - thumbnail = self._og_search_thumbnail(webpage, default=None) - upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', - webpage, 'upload date', default=None, group='value')) - - series = self._search_regex( - r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'series', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?i)aflevering (\d+)', title, 'episode number', default=None)) - tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': 
thumbnail, - 'upload_date': upload_date, - 'series': series, - 'episode_number': episode_number, - 'tags': tags, - 'formats': formats, - } - - -class VierVideosIE(InfoExtractor): - IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' - _TESTS = [{ - 'url': 'http://www.vier.be/demoestuin/videos', - 'info_dict': { - 'id': 'demoestuin', - }, - 'playlist_mincount': 153, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos', - 'info_dict': { - 'id': 'temptationisland', - }, - 'playlist_mincount': 159, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=6', - 'info_dict': { - 'id': 'demoestuin-page6', - }, - 'playlist_mincount': 20, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=7', - 'info_dict': { - 'id': 'demoestuin-page7', - }, - 'playlist_mincount': 13, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - program = mobj.group('program') - site = mobj.group('site') - - page_id = mobj.group('page') - if page_id: - page_id = int(page_id) - start_page = page_id - playlist_id = '%s-page%d' % (program, page_id) - else: - start_page = 0 - playlist_id = program - - entries = [] - for current_page_id in itertools.count(start_page): - current_page = self._download_webpage( - 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), - program, - 'Downloading page %d' % (current_page_id + 1)) - page_entries = [ - self.url_result('http://www.' + site + '.be' + video_url, 'Vier') - for video_url in re.findall( - r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] - entries.extend(page_entries) - if page_id or '>Meer<' not in current_page: - break - - return self.playlist_result(entries, playlist_id) From f7c5a5e96756636379a0b1afbeadb08b9c643bef Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 21 Sep 2022 09:12:54 +0000 Subject: [PATCH 161/284] [extractor/tiktok] Fix TikTokIE (#4984) Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index c585383942..4a35a241c7 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -25,7 +25,7 @@ class TikTokBaseIE(InfoExtractor): - _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')] + _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')] _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 @@ -33,7 +33,6 @@ class TikTokBaseIE(InfoExtractor): _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') - _session_initialized = False @staticmethod def _create_url(user_id, video_id): @@ -43,12 +42,6 @@ def _get_sigi_state(self, webpage, display_id): return self._parse_json(get_element_by_id( 'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id) - def _real_initialize(self): - if self._session_initialized: - return - self._request_webpage(HEADRequest('https://www.tiktok.com'), None, note='Setting up session', fatal=False) - TikTokBaseIE._session_initialized = True - def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', 
''.join(random.choice('0123456789abcdef') for _ in range(160))) @@ -289,7 +282,7 @@ def extract_addr(addr, add_meta={}): 'uploader_url': user_url, 'track': music_track, 'album': str_or_none(music_info.get('album')) or None, - 'artist': music_author, + 'artist': music_author or None, 'timestamp': int_or_none(aweme_detail.get('create_time')), 'formats': formats, 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), @@ -522,7 +515,7 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['trying feed workaround', 'Unable to find video in feed'] + 'skip': 'This video is unavailable', }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', @@ -530,18 +523,11 @@ class TikTokIE(TikTokBaseIE): }] def _extract_aweme_app(self, aweme_id): - try: - aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video details', errnote='Unable to download video details').get('aweme_detail') - if not aweme_detail: - raise ExtractorError('Video not available', video_id=aweme_id) - except ExtractorError as e: - self.report_warning(f'{e.orig_msg}; trying feed workaround') - feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) - if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, + note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] + aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + if not aweme_detail: + raise ExtractorError('Unable to find video in feed', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) def _real_extract(self, url): @@ -572,6 +558,7 @@ def _real_extract(self, url): class TikTokUserIE(TikTokBaseIE): IE_NAME = 'tiktok:user' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])' + _WORKING = False _TESTS = [{ 'url': 'https://tiktok.com/@corgibobaa?lang=en', 'playlist_mincount': 45, @@ -708,6 +695,7 @@ def _real_extract(self, url): class TikTokSoundIE(TikTokBaseListIE): IE_NAME = 'tiktok:sound' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' + _WORKING = False _QUERY_NAME = 'music_id' _API_ENDPOINT = 'music/aweme' _TESTS = [{ @@ -731,6 +719,7 @@ class TikTokSoundIE(TikTokBaseListIE): class TikTokEffectIE(TikTokBaseListIE): IE_NAME = 'tiktok:effect' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' 
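# A minimal self-contained sketch (not part of this patch) of the lookup the
# simplified _extract_aweme_app above performs: query the `feed` endpoint and
# select the aweme whose id matches. `call_api` is a hypothetical stand-in
# for the extractor's _call_api helper.
def find_aweme_in_feed(call_api, aweme_id):
    feed_list = call_api('feed', {'aweme_id': aweme_id}).get('aweme_list') or []
    # The feed may contain unrelated videos, so match on the exact id
    return next(
        (aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id),
        None)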
+ _WORKING = False _QUERY_NAME = 'sticker_id' _API_ENDPOINT = 'sticker/aweme' _TESTS = [{ @@ -750,6 +739,7 @@ class TikTokEffectIE(TikTokBaseListIE): class TikTokTagIE(TikTokBaseListIE): IE_NAME = 'tiktok:tag' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)' + _WORKING = False _QUERY_NAME = 'ch_id' _API_ENDPOINT = 'challenge/aweme' _TESTS = [{ From b27bc13af6a2a96f66f5209151dd2965a7c514fe Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Sep 2022 01:23:22 +0530 Subject: [PATCH 162/284] [extractor/patreon] Sort formats --- yt_dlp/extractor/patreon.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 529aba178c..43c90c8f16 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -277,6 +277,7 @@ def _real_extract(self, url): } elif name == 'video': formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) + self._sort_formats(formats) return { **info, 'formats': formats, From 8ca48a1a5427040fd708f33a264c10d5d0e85fc1 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Sep 2022 01:53:37 +0530 Subject: [PATCH 163/284] [extractor] Fix `fatal=False` in `RetryManager` --- yt_dlp/extractor/amazon.py | 2 +- yt_dlp/extractor/common.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py index 56a8d844ac..9e9e9772da 100644 --- a/yt_dlp/extractor/amazon.py +++ b/yt_dlp/extractor/amazon.py @@ -39,7 +39,7 @@ class AmazonStoreIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - for retry in self.RetryManager(fatal=True): + for retry in self.RetryManager(): webpage = self._download_webpage(url, id) try: data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e8fa8fdde8..4132c831cd 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3857,8 +3857,10 @@ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_l return True def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True): - RetryManager.report_retry(err, _count or int(fatal), _retries, info=self.to_screen, warn=self.report_warning, - sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) + RetryManager.report_retry( + err, _count or int(fatal), _retries, + info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning, + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) def RetryManager(self, **kwargs): return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs) From 2fa669f759eae6d5c7e608e3ee628f9d60d03e83 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Thu, 22 Sep 2022 01:37:44 +0530 Subject: [PATCH 164/284] [docs] Misc improvements Closes #4987, Closes #4906, Closes #4919, Closes #4977, Closes #4979 --- README.md | 34 +++++++++++++++--------------- devscripts/make_lazy_extractors.py | 2 +- setup.cfg | 8 +++++++ yt_dlp/__init__.py | 2 +- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/cybrary.py | 5 ++--- yt_dlp/extractor/generic.py | 4 ++-- yt_dlp/extractor/niconico.py | 3 +-- yt_dlp/options.py | 4 ++-- yt_dlp/utils.py | 13 ++++++++---- yt_dlp/webvtt.py | 1 - 11 files changed, 44 insertions(+), 34 deletions(-) diff --git 
a/README.md b/README.md index 9f331663d5..07ed04061c 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![YT-DLP](https://raw.githubusercontent.com/yt-dlp/yt-dlp/master/.github/banner.svg)](#readme) -[![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](##installation "Installation") +[![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#installation "Installation") [![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPi") [![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") [![Matrix](https://img.shields.io/matrix/yt-dlp:matrix.org?color=brightgreen&labelColor=555555&label=&logo=element&style=for-the-badge)](https://matrix.to/#/#yt-dlp:matrix.org "Matrix") @@ -562,7 +562,7 @@ ## Download Options: Needs ffmpeg. This option can be used multiple times to download multiple sections, e.g. --download-sections - "*10:15-15:00" --download-sections "intro" + "*10:15-inf" --download-sections "intro" --downloader [PROTO:]NAME Name or path of the external downloader to use (optionally) prefixed by the protocols (http, ftp, m3u8, dash, rstp, rtmp, mms) to @@ -1079,9 +1079,9 @@ ## Extractor Options: --no-hls-split-discontinuity Do not split HLS playlists to different formats at discontinuities such as ad breaks (default) - --extractor-args KEY:ARGS Pass these arguments to the extractor. See - "EXTRACTOR ARGUMENTS" for details. You can - use this option multiple times to give + --extractor-args IE_KEY:ARGS Pass ARGS arguments to the IE_KEY extractor. + See "EXTRACTOR ARGUMENTS" for details. You + can use this option multiple times to give arguments for different extractors # CONFIGURATION @@ -1092,14 +1092,14 @@ # CONFIGURATION 1. **Portable Configuration**: `yt-dlp.conf` in the same directory as the bundled binary. If you are running from source-code (`<root dir>/yt_dlp/__main__.py`), the root directory is used instead. 1. **Home Configuration**: `yt-dlp.conf` in the home path given by `-P`, or in the current directory if no such path is given 1. **User Configuration**: - * `%XDG_CONFIG_HOME%/yt-dlp/config` (recommended on Linux/macOS) - * `%XDG_CONFIG_HOME%/yt-dlp.conf` - * `%APPDATA%/yt-dlp/config` (recommended on Windows) - * `%APPDATA%/yt-dlp/config.txt` + * `$XDG_CONFIG_HOME/yt-dlp/config` (recommended on Linux/macOS) + * `$XDG_CONFIG_HOME/yt-dlp.conf` + * `$APPDATA/yt-dlp/config` (recommended on Windows) + * `$APPDATA/yt-dlp/config.txt` * `~/yt-dlp.conf` * `~/yt-dlp.conf.txt` - `%XDG_CONFIG_HOME%` defaults to `~/.config` if undefined. On windows, `%APPDATA%` generally points to `C:\Users\<user name>\AppData\Roaming` and `~` points to `%HOME%` if present, `%USERPROFILE%` (generally `C:\Users\<user name>`), or `%HOMEDRIVE%%HOMEPATH%` + `$XDG_CONFIG_HOME` defaults to `~/.config` if undefined. On windows, `$APPDATA` generally points to `C:\Users\<user name>\AppData\Roaming` and `~` points to `$HOME` if present, `$USERPROFILE` (generally `C:\Users\<user name>`), or `${HOMEDRIVE}${HOMEPATH}` 1. 
**System Configuration**: `/etc/yt-dlp.conf` @@ -1120,7 +1120,7 @@ # Save all videos under YouTube directory in your home directory -o ~/YouTube/%(title)s.%(ext)s ``` -Note that options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. +Note that options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary as-if it were a UNIX shell. You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. @@ -1148,7 +1148,7 @@ ### Authentication with `.netrc` file ``` To activate authentication with the `.netrc` file you should pass `--netrc` to yt-dlp or place it in the [configuration file](#configuration). -The default location of the .netrc file is `$HOME` (`~`) in UNIX. On Windows, it is `%HOME%` if present, `%USERPROFILE%` (generally `C:\Users\<user name>`) or `%HOMEDRIVE%%HOMEPATH%` +The default location of the .netrc file is `$HOME` (`~`). On Windows, if `$HOME` is not present, `$USERPROFILE` (generally `C:\Users\<user name>`) or `${HOMEDRIVE}${HOMEPATH}` is used # OUTPUT TEMPLATE @@ -1627,7 +1627,7 @@ # MODIFYING METADATA The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or an [output template](#output-template) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. -Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`. +Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--embed-metadata`. 
This option also has a few special uses: @@ -1673,11 +1673,11 @@ # Set title as "Series name S01E05" $ yt-dlp --parse-metadata "%(series)s S%(season_number)02dE%(episode_number)02d:%(title)s" # Prioritize uploader as the "artist" field in video metadata -$ yt-dlp --parse-metadata "%(uploader|)s:%(meta_artist)s" --add-metadata +$ yt-dlp --parse-metadata "%(uploader|)s:%(meta_artist)s" --embed-metadata # Set "comment" field in video metadata using description instead of webpage_url, # handling multiple lines correctly -$ yt-dlp --parse-metadata "description:(?s)(?P<meta_comment>.+)" --add-metadata +$ yt-dlp --parse-metadata "description:(?s)(?P<meta_comment>.+)" --embed-metadata # Do not set any "synopsis" in the video metadata $ yt-dlp --parse-metadata ":(?P<meta_synopsis>)" @@ -1697,16 +1697,16 @@ # EXTRACTOR ARGUMENTS The following extractors use this feature: #### youtube +* `lang`: Language code to prefer translated metadata of this language (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively * `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details -* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total +* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests -* `lang`: Language code to prefer translated metadata of this language (case-sensitive). 
By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 383c7e057c..2d4530eb96 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -2,8 +2,8 @@ # Allow direct execution import os -import sys import shutil +import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/setup.cfg b/setup.cfg index d33c7d8549..2def390f51 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,14 @@ per_file_ignores = devscripts/lazy_load_template.py: F401 +[autoflake] +ignore-init-module-imports = true +ignore-pass-after-docstring = true +remove-all-unused-imports = true +remove-duplicate-keys = true +remove-unused-variables = true + + [tool:pytest] addopts = -ra -v --strict-markers markers = diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 29c467b0e8..9382ff43ba 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -489,7 +489,7 @@ def report_conflict(arg1, opt1, arg2='--allow-unplayable-formats', opt2='allow_u val1=opts.sponskrub and opts.sponskrub_cut) # Conflicts with --allow-unplayable-formats - report_conflict('--add-metadata', 'addmetadata') + report_conflict('--embed-metadata', 'addmetadata') report_conflict('--embed-chapters', 'addchapters') report_conflict('--embed-info-json', 'embed_infojson') report_conflict('--embed-subs', 'embedsubtitles') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4132c831cd..87660bb23b 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1236,7 +1236,7 @@ def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal, has_default = False, True json_string = self._search_regex( - rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}', + rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})', string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) if not json_string: return default diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index 7da581828e..73f2439b31 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -1,11 +1,10 @@ -from .common import InfoExtractor - +from .common import InfoExtractor from ..utils import ( ExtractorError, smuggle_url, str_or_none, traverse_obj, - urlencode_postdata + urlencode_postdata, ) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 55b3addde4..828c8a6cff 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2623,8 +2623,8 @@ def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: + is_intentional = smuggled_data.get('to_generic') + if 'force_videoid' in smuggled_data: force_videoid = 
smuggled_data['force_videoid'] video_id = force_videoid else: diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 82fb27631e..82b60b4765 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -557,8 +557,7 @@ class NiconicoPlaylistBaseIE(InfoExtractor): } def _call_api(self, list_id, resource, query): - "Implement this in child class" - pass + raise NotImplementedError('Must be implemented in subclasses') @staticmethod def _parse_owner(item): diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 9ad48486e8..861bbf7864 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1820,14 +1820,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)]) extractor.add_option( '--extractor-args', - metavar='KEY:ARGS', dest='extractor_args', default={}, type='str', + metavar='IE_KEY:ARGS', dest='extractor_args', default={}, type='str', action='callback', callback=_dict_from_options_callback, callback_kwargs={ 'multiple_keys': False, 'process': lambda val: dict( _extractor_arg_parser(*arg.split('=', 1)) for arg in val.split(';')) }, help=( - 'Pass these arguments to the extractor. See "EXTRACTOR ARGUMENTS" for details. ' + 'Pass ARGS arguments to the IE_KEY extractor. See "EXTRACTOR ARGUMENTS" for details. ' 'You can use this option multiple times to give arguments for different extractors')) extractor.add_option( '--youtube-include-dash-manifest', '--no-youtube-skip-dash-manifest', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 443c498148..26ef3c7dd1 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -591,9 +591,14 @@ def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs): def decode(self, s): if self.transform_source: s = self.transform_source(s) - if self.ignore_extra: - return self.raw_decode(s.lstrip())[0] - return super().decode(s) + try: + if self.ignore_extra: + return self.raw_decode(s.lstrip())[0] + return super().decode(s) + except json.JSONDecodeError as e: + if e.pos is not None: + raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos) + raise def sanitize_open(filename, open_mode): @@ -762,7 +767,7 @@ def sanitized_Request(url, *args, **kwargs): def expand_path(s): - """Expand $ shell variables and ~""" + """Expand shell variables and ~""" return os.path.expandvars(compat_expanduser(s)) diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index 23d67a8971..1138865ba3 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -140,7 +140,6 @@ class HeaderBlock(Block): A WebVTT block that may only appear in the header part of the file, i.e. before any cue blocks. 
""" - pass From 163281178a61565cd592426d452978ff47e63439 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 21 Sep 2022 20:53:08 +0000 Subject: [PATCH 165/284] [extractor/wistia] Match IDs in embed URLs (#4990) Closes #4985 Authored by: bashonly --- yt_dlp/extractor/generic.py | 35 +++++++++++++++++++++++++---------- yt_dlp/extractor/wistia.py | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 828c8a6cff..fadc0819b0 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -876,17 +876,19 @@ class GenericIE(InfoExtractor): # Wistia embed { 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', + 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, + 'id': '5vd7p4bct5', + 'ext': 'bin', + 'title': 'md5:db27290a04ae306319b0b5cce3cdf7bd', + 'description': 'md5:e835b7808e11aaef29ccdc28888437af', + 'duration': 623.019, 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, + 'timestamp': 1663258727, + 'upload_date': '20220915', + 'filesize': 29798093, + 'age_limit': 0, + 'thumbnail': r're:^https?://.+\.jpg$', }, }, # Wistia standard embed (async) @@ -903,7 +905,20 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'webpage 404 not found', + }, + # Wistia embed with video IDs in query + { + 'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt', + 'info_dict': { + 'id': 'md5:922795280019b3a70ca133330a4b0108', + 'title': 'Amplify Sessions - Amplitude', + 'description': 'md5:3d271bdee219417bb1c35eeb0937b923', + 'age_limit': 0, + 'thumbnail': r're:^https?://.+\.jpg$', + }, + 'playlist_count': 3, }, # Soundcloud embed { diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 4388286241..ba7497493f 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -131,6 +131,20 @@ class WistiaIE(WistiaBaseIE): 'timestamp': 1463607249, 'duration': 4987.11, }, + 'skip': 'webpage 404 not found', + }, { + 'url': 'wistia:5vd7p4bct5', + 'md5': 'b9676d24bf30945d97060638fbfe77f0', + 'info_dict': { + 'id': '5vd7p4bct5', + 'ext': 'bin', + 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', + 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', + 'upload_date': '20220915', + 'timestamp': 1663258727, + 'duration': 623.019, + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + }, }, { 'url': 'wistia:sh7fpupwlt', 'only_matching': True, @@ -157,6 +171,8 @@ def _extract_embed_urls(cls, url, webpage): urls.append('wistia:%s' % match.group('id')) for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:wmediaid|wvideo(?:id)?)(?:%5D)?=(?P<id>[a-z0-9]{10})', url): + urls.append('wistia:%s' % match.group('id')) return urls @classmethod From 1c09783f7ad6653001cb1788cbc6de635d44a4c4 Mon Sep 17 00:00:00 2001 From: GautamMKGarg <GautamMKgarg@gmail.com> Date: Thu, 22 Sep 2022 06:48:48 
+0530 Subject: [PATCH 166/284] [extractor/hungama] Add subtitle (#4856) Authored by: GautamMKGarg, pukkandan --- yt_dlp/extractor/hungama.py | 44 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/hungama.py b/yt_dlp/extractor/hungama.py index 938a242962..717f50a834 100644 --- a/yt_dlp/extractor/hungama.py +++ b/yt_dlp/extractor/hungama.py @@ -20,15 +20,17 @@ class HungamaIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'md5': '687c5f1e9f832f3b59f44ed0eb1f120a', 'info_dict': { - 'id': '2931166', + 'id': '39349649', 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, + 'title': 'Krishna Chants', + 'description': 'Watch Krishna Chants video now. You can also watch other latest videos only at Hungama', + 'upload_date': '20180829', + 'duration': 264, + 'timestamp': 1535500800, + 'view_count': int, + 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', } }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', @@ -40,12 +42,7 @@ class HungamaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info = self._search_json_ld(webpage, video_id) - - m3u8_url = self._download_json( + video_json = self._download_json( 'https://www.hungama.com/index.php', video_id, data=urlencode_postdata({'content_id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', @@ -53,18 +50,25 @@ def _real_extract(self, url): }, query={ 'c': 'common', 'm': 'get_video_mdn_url', - })['stream_url'] + }) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') self._sort_formats(formats) - info.update({ + json_ld = self._search_json_ld( + self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + + return { + **json_ld, 'id': video_id, 'formats': formats, - }) - return info + 'subtitles': { + 'en': [{ + 'url': video_json['sub_title'], + 'ext': 'vtt', + }] + } if video_json.get('sub_title') else None, + } class HungamaSongIE(InfoExtractor): From 4cca2eb1bf8bb830df15cbcda21a93fe2392573a Mon Sep 17 00:00:00 2001 From: Tanner Anderson <me@tanner.technology> Date: Wed, 21 Sep 2022 19:44:07 -0600 Subject: [PATCH 167/284] [extractor/nebula] Add nebula.tv (#4918) Closes #4917 Authored by: tannertechnology --- yt_dlp/extractor/nebula.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 7057b8b26e..861fcb1643 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..utils import ExtractorError, parse_iso8601, try_get +_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' + class NebulaBaseIE(InfoExtractor): _NETRC_MACHINE = 'watchnebula' @@ -148,7 +150,7 @@ def _perform_login(self, username=None, password=None): class NebulaIE(NebulaBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' + _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' _TESTS = [ { 'url': 
'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', @@ -246,7 +248,7 @@ def _real_extract(self, url): class NebulaSubscriptionsIE(NebulaBaseIE): IE_NAME = 'nebula:subscriptions' - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/myshows' + _VALID_URL = rf'{_BASE_URL_RE}/myshows' _TESTS = [ { 'url': 'https://nebula.app/myshows', @@ -274,7 +276,7 @@ def _real_extract(self, url): class NebulaChannelIE(NebulaBaseIE): IE_NAME = 'nebula:channel' - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!myshows|videos/)(?P<id>[-\w]+)' + _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' _TESTS = [ { 'url': 'https://nebula.app/tom-scott-presents-money', From 80eb0bd9b94106df9e1e5ac288def6e239937329 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Thu, 22 Sep 2022 05:39:02 +0000 Subject: [PATCH 168/284] [extractor/youtube] Add support for Shorts audio pivot feed (#4932) This feed shows Shorts using the audio of a given video. ytshortsap: prefix can be used as a shortcut until YouTube implements an official view. Closes #4911 Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/youtube.py | 41 +++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 43e2f93d35..e247871361 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -21,6 +21,7 @@ YoutubeYtBeIE, YoutubeYtUserIE, YoutubeWatchLaterIE, + YoutubeShortsAudioPivotIE ) from .abc import ( diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ac1a5f2109..2afb993d01 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4327,8 +4327,8 @@ def _playlist_entries(self, video_list_renderer): yield self._extract_video(renderer) def _rich_entries(self, rich_grid_renderer): - renderer = try_get( - rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} + renderer = traverse_obj( + rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} video_id = renderer.get('videoId') if not video_id: return @@ -5640,6 +5640,16 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1, 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, 'expected_warnings': ['Preferring "ja"'], + }, { + # shorts audio pivot for 2GtVksBMYFM. 
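# A standalone sketch of how that `bp` value can be produced, mirroring the
# _generate_audio_pivot_params helper added later in this patch. It assumes
# the standard 11-character video id; the protobuf template is copied from
# the patch itself.
import base64
import urllib.parse

def audio_pivot_params(video_id):
    # Serialized protobuf params embedding the video id three times
    pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3)
    return urllib.parse.quote(base64.b64encode(pb_params).decode())

# audio_pivot_params('2GtVksBMYFM') yields the URL-quoted base64 `bp`
# parameter for the sfv_audio_pivot feed URL in the test below.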
+ 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + 'info_dict': { + 'id': 'sfv_audio_pivot', + 'title': 'sfv_audio_pivot', + 'tags': [], + }, + 'playlist_mincount': 50, + }] @classmethod @@ -6307,6 +6317,33 @@ def _real_extract(self, url): ie=YoutubeTabIE, video_id=playlist_id) +class YoutubeShortsAudioPivotIE(InfoExtractor): + IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video); "ytshortsap:" prefix' + IE_NAME = 'youtube:shorts:pivot:audio' + _VALID_URL = f'(?x)^ytshortsap:{YoutubeIE._VALID_URL[5:]}' + _TESTS = [{ + 'url': 'ytshortsap:https://www.youtube.com/shorts/Lyj-MZSAA9o?feature=share', + 'only_matching': True, + }, { + 'url': 'ytshortsap:Lyj-MZSAA9o', + 'only_matching': True, + }] + + @staticmethod + def _generate_audio_pivot_params(video_id): + """ + Generates sfv_audio_pivot browse params for this video id + """ + pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3) + return urllib.parse.quote(base64.b64encode(pb_params).decode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://www.youtube.com/feed/sfv_audio_pivot?bp={self._generate_audio_pivot_params(video_id)}', + ie=YoutubeTabIE) + + class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list From 2e7675489f4323c17c8de1e1fd264365c2c36e26 Mon Sep 17 00:00:00 2001 From: Pritam Das <49360491+pritam20ps05@users.noreply.github.com> Date: Thu, 22 Sep 2022 16:27:20 +0530 Subject: [PATCH 169/284] [extractor/instagram] Extract more metadata (#4708) Authored by: pritam20ps05 --- yt_dlp/extractor/instagram.py | 148 ++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 59 deletions(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index e997a3fbb7..c9da7e36f6 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -173,18 +173,9 @@ def _extract_product(self, product_info): if isinstance(product_info, list): product_info = product_info[0] - comment_data = traverse_obj(product_info, ('edge_media_to_parent_comment', 'edges')) - comments = [{ - 'author': traverse_obj(comment_dict, ('node', 'owner', 'username')), - 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')), - 'id': traverse_obj(comment_dict, ('node', 'id')), - 'text': traverse_obj(comment_dict, ('node', 'text')), - 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none), - } for comment_dict in comment_data] if comment_data else None - user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or _pk_to_id(product_info.get('pk')), + 'id': _pk_to_id(traverse_obj(product_info, 'pk', 'id', expected_type=str_or_none)[:19]), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -194,7 +185,7 @@ def _extract_product(self, product_info): 'view_count': int_or_none(product_info.get('view_count')), 'like_count': int_or_none(product_info.get('like_count')), 'comment_count': int_or_none(product_info.get('comment_count')), - 'comments': comments, + '__post_extractor': self.extract_comments(_pk_to_id(product_info.get('pk'))), 'http_headers': { 'Referer': 'https://www.instagram.com/', } @@ -216,6 +207,23 @@ def _extract_product(self, 
product_info): **self._extract_product_media(product_info) } + def _get_comments(self, video_id): + comments_info = self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id, + fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {} + + comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments') + for comment_dict in comment_data or []: + yield { + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username'), ('user', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id'), ('user', 'pk')), + 'author_thumbnail': traverse_obj(comment_dict, ('node', 'owner', 'profile_pic_url'), ('user', 'profile_pic_url'), expected_type=url_or_none), + 'id': traverse_obj(comment_dict, ('node', 'id'), 'pk'), + 'text': traverse_obj(comment_dict, ('node', 'text'), 'text'), + 'like_count': traverse_obj(comment_dict, ('node', 'edge_liked_by', 'count'), 'comment_like_count', expected_type=int_or_none), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), 'created_at', expected_type=int_or_none), + } + class InstagramIOSIE(InfoExtractor): IE_DESC = 'IOS instagram:// URL' @@ -258,7 +266,7 @@ class InstagramIE(InstagramBaseIE): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 8.747, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': '2815873', @@ -268,27 +276,34 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { - # missing description - 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + # reel + 'url': 'https://www.instagram.com/reel/Chunk8-jurw/', + 'md5': 'f6d8277f74515fa3ff9f5791426e42b1', 'info_dict': { - 'id': 'BA-pQFBG8HZ', + 'id': 'Chunk8-jurw', 'ext': 'mp4', - 'title': 'Video by britneyspears', + 'title': 'Video by instagram', + 'description': 'md5:c9cde483606ed6f80fbe9283a6a2b290', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1453760977, - 'upload_date': '20160125', - 'uploader_id': '12246775', - 'uploader': 'Britney Spears', - 'channel': 'britneyspears', + 'duration': 5.016, + 'timestamp': 1661529231, + 'upload_date': '20220826', + 'uploader_id': '25025320', + 'uploader': 'Instagram', + 'channel': 'instagram', 'like_count': int, 'comment_count': int, 'comments': list, }, - 'params': { - 'skip_download': True, - }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # multi video post 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', @@ -297,18 +312,24 @@ class InstagramIE(InstagramBaseIE): 'id': 'BQ0dSaohpPW', 'ext': 'mp4', 'title': 'Video 1', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dTpOhuHT', 'ext': 'mp4', 'title': 'Video 2', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dT7RBFeF', 'ext': 'mp4', 'title': 'Video 3', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }], 'info_dict': { @@ -316,6 +337,10 @@ class InstagramIE(InstagramBaseIE): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + 'expected_warnings': [ + 'General metadata 
extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # IGTV 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', @@ -334,7 +359,11 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', - } + }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -367,6 +396,15 @@ def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') media, webpage = {}, '' + if self._get_cookies(url).get('sessionid'): + info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, errnote='Video info extraction failed', + note='Downloading video info', headers=self._API_HEADERS), ('items', 0)) + if info: + media.update(info) + return self._extract_product(media) + api_check = self._download_json( f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} @@ -374,40 +412,32 @@ def _real_extract(self, url): if not csrf_token: self.report_warning('No csrf token set by Instagram API', video_id) - elif api_check.get('status') != 'ok': - self.report_warning('Instagram API is not granting access', video_id) else: - if self._get_cookies(url).get('sessionid'): - media.update(traverse_obj(self._download_json( - f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, - fatal=False, note='Downloading video info', headers={ - **self._API_HEADERS, - 'X-CSRFToken': csrf_token.value, - }), ('items', 0)) or {}) - if media: - return self._extract_product(media) + csrf_token = csrf_token.value if api_check.get('status') == 'ok' else None + if not csrf_token: + self.report_warning('Instagram API is not granting access', video_id) - variables = { - 'shortcode': video_id, - 'child_comment_count': 3, - 'fetch_comment_count': 40, - 'parent_comment_count': 24, - 'has_threaded_comments': True, - } - general_info = self._download_json( - 'https://www.instagram.com/graphql/query/', video_id, fatal=False, - headers={ - **self._API_HEADERS, - 'X-CSRFToken': csrf_token.value, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': url, - }, query={ - 'query_hash': '9f8827793ef34641b2fb195d4d41151c', - 'variables': json.dumps(variables, separators=(',', ':')), - }) - media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token or '', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) - if not media: + if not general_info: self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) webpage, urlh = self._download_webpage_handle(url, video_id) shared_data = self._search_json( 
@@ -418,12 +448,12 @@ def _real_extract(self, url): shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) else: - self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage') + self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage (some metadata might be missing).') webpage = self._download_webpage( f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) additional_data = self._search_json( r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) - if not additional_data: + if not additional_data and not media: self.raise_login_required('Requested content is not available, rate-limit reached or login required') product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) From 32972518da55934f7ccf7960f788363d5700da5e Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 23 Sep 2022 12:10:35 +1200 Subject: [PATCH 170/284] [extractor/telegraaf] Use mobile GraphQL API endpoint Workaround for Cloudflare 403 Fixes https://github.com/yt-dlp/yt-dlp/issues/5000 Authored by: coletdjnz --- yt_dlp/extractor/telegraaf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/telegraaf.py b/yt_dlp/extractor/telegraaf.py index bc9a8d6084..6562d122cd 100644 --- a/yt_dlp/extractor/telegraaf.py +++ b/yt_dlp/extractor/telegraaf.py @@ -31,7 +31,9 @@ def _real_extract(self, url): article_id = self._match_id(url) video_id = self._download_json( - 'https://www.telegraaf.nl/graphql', article_id, query={ + 'https://app.telegraaf.nl/graphql', article_id, + headers={'User-Agent': 'De Telegraaf/6.8.11 (Android 11; en_US)'}, + query={ 'query': '''{ article(uid: %s) { videos { From f55523cfdd18dcd578f5d96cbb06266663169d35 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Fri, 23 Sep 2022 19:21:07 +0530 Subject: [PATCH 171/284] [utils] `js_to_json`: Improve Closes #4900 --- yt_dlp/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 26ef3c7dd1..f6ab9905d4 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3298,7 +3298,7 @@ def fix_kv(m): return '"%d":' % i if v.endswith(':') else '%d' % i if v in vars: - return vars[v] + return json.dumps(vars[v]) if strict: raise ValueError(f'Unknown value: {v}') @@ -3310,6 +3310,7 @@ def create_map(mobj): code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) if not strict: code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| From 3c757d5ed2527b17881eb65c67ddbe0d1335771f Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 23 Sep 2022 21:52:11 +0000 Subject: [PATCH 172/284] [extractor/wistia] Add support for channels (#4819) Fixes https://github.com/yt-dlp/yt-dlp/issues/4748 Related: https://github.com/yt-dlp/yt-dlp/issues/4985 Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/generic.py | 30 ---- yt_dlp/extractor/wistia.py | 239 +++++++++++++++++++++++++++----- 3 files changed, 202 insertions(+), 68 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e247871361..c2575bc928 100644 --- a/yt_dlp/extractor/_extractors.py +++ 
b/yt_dlp/extractor/_extractors.py @@ -2142,6 +2142,7 @@ from .wistia import ( WistiaIE, WistiaPlaylistIE, + WistiaChannelIE, ) from .worldstarhiphop import WorldStarHipHopIE from .wppilot import ( diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index fadc0819b0..672034c6d7 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -873,24 +873,6 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': 'b9676d24bf30945d97060638fbfe77f0', - 'info_dict': { - 'id': '5vd7p4bct5', - 'ext': 'bin', - 'title': 'md5:db27290a04ae306319b0b5cce3cdf7bd', - 'description': 'md5:e835b7808e11aaef29ccdc28888437af', - 'duration': 623.019, - 'uploader': 'study.com', - 'timestamp': 1663258727, - 'upload_date': '20220915', - 'filesize': 29798093, - 'age_limit': 0, - 'thumbnail': r're:^https?://.+\.jpg$', - }, - }, # Wistia standard embed (async) { 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', @@ -908,18 +890,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'webpage 404 not found', }, - # Wistia embed with video IDs in query - { - 'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt', - 'info_dict': { - 'id': 'md5:922795280019b3a70ca133330a4b0108', - 'title': 'Amplify Sessions - Amplitude', - 'description': 'md5:3d271bdee219417bb1c35eeb0937b923', - 'age_limit': 0, - 'thumbnail': r're:^https?://.+\.jpg$', - }, - 'playlist_count': 3, - }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index ba7497493f..e1e5855c26 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -1,30 +1,36 @@ import re +import urllib.error +import urllib.parse +from base64 import b64decode from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, int_or_none, - try_call, + parse_qs, + traverse_obj, try_get, + update_url_query, ) class WistiaBaseIE(InfoExtractor): _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})' _VALID_URL_BASE = r'https?://(?:\w+\.)?wistia\.(?:net|com)/(?:embed/)?' - _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' + _EMBED_BASE_URL = 'http://fast.wistia.net/embed/' def _download_embed_config(self, config_type, config_id, referer): - base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id) + base_url = self._EMBED_BASE_URL + '%s/%s' % (config_type, config_id) embed_config = self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. 
}) - if isinstance(embed_config, dict) and embed_config.get('error'): + error = traverse_obj(embed_config, 'error') + if error: raise ExtractorError( - 'Error while getting the playlist', expected=True) + f'Error while getting the playlist: {error}', expected=True) return embed_config @@ -114,10 +120,38 @@ def _extract_media(self, embed_config): 'subtitles': subtitles, } + @classmethod + def _extract_from_webpage(cls, url, webpage): + from .teachable import TeachableIE + + if list(TeachableIE._extract_embed_urls(url, webpage)): + return + + yield from super()._extract_from_webpage(url, webpage) + + @classmethod + def _extract_wistia_async_embed(cls, webpage): + # https://wistia.com/support/embed-and-share/video-on-your-website + # https://wistia.com/support/embed-and-share/channel-embeds + yield from re.finditer( + r'''(?sx) + <(?:div|section)[^>]+class=([\"'])(?:(?!\1).)*?(?P<type>wistia[a-z_0-9]+)\s*\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 + ''', webpage) + + @classmethod + def _extract_url_media_id(cls, url): + mobj = re.search(r'(?:wmediaid|wvideo(?:id)?)]?=(?P<id>[a-z0-9]{10})', urllib.parse.unquote_plus(url)) + if mobj: + return mobj.group('id') + class WistiaIE(WistiaBaseIE): _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) - _EMBED_REGEX = [r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})'] + _EMBED_REGEX = [ + r'''(?x) + <(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'] + (?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10}) + '''] _TESTS = [{ # with hls video 'url': 'wistia:807fafadvk', @@ -131,7 +165,20 @@ class WistiaIE(WistiaBaseIE): 'timestamp': 1463607249, 'duration': 4987.11, }, - 'skip': 'webpage 404 not found', + 'skip': 'video unavailable', + }, { + 'url': 'wistia:a6ndpko1wg', + 'md5': '10c1ce9c4dde638202513ed17a3767bd', + 'info_dict': { + 'id': 'a6ndpko1wg', + 'ext': 'bin', + 'title': 'Episode 2: Boxed Water\'s retention is thirsty', + 'upload_date': '20210324', + 'description': 'md5:da5994c2c2d254833b412469d9666b7a', + 'duration': 966.0, + 'timestamp': 1616614369, + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + } }, { 'url': 'wistia:5vd7p4bct5', 'md5': 'b9676d24bf30945d97060638fbfe77f0', @@ -159,41 +206,53 @@ class WistiaIE(WistiaBaseIE): 'only_matching': True, }] - # https://wistia.com/support/embed-and-share/video-on-your-website - @classmethod - def _extract_embed_urls(cls, url, webpage): - urls = list(super()._extract_embed_urls(url, webpage)) - - for match in re.finditer( - r'''(?sx) - <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 - ''', webpage): - urls.append('wistia:%s' % match.group('id')) - for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): - urls.append('wistia:%s' % match.group('id')) - for match in re.finditer(r'(?:wmediaid|wvideo(?:id)?)(?:%5D)?=(?P<id>[a-z0-9]{10})', url): - urls.append('wistia:%s' % match.group('id')) - return urls - - @classmethod - def _extract_from_webpage(cls, url, webpage): - from .teachable import TeachableIE - - if list(TeachableIE._extract_embed_urls(url, webpage)): - return - - for entry in super()._extract_from_webpage(url, webpage): - yield { - **entry, - '_type': 'url_transparent', - 'uploader': try_call(lambda: re.match(r'(?:https?://)?([^/]+)/', 
url).group(1)),
-            }
+    _WEBPAGE_TESTS = [{
+        'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool',
+        'info_dict': {
+            'id': 'cqwukac3z1',
+            'ext': 'bin',
+            'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content',
+            'duration': 158.125,
+            'timestamp': 1618974400,
+            'description': 'md5:27abc99a758573560be72600ef95cece',
+            'upload_date': '20210421',
+            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin',
+        }
+    }, {
+        'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+        'md5': 'b9676d24bf30945d97060638fbfe77f0',
+        'info_dict': {
+            'id': '5vd7p4bct5',
+            'ext': 'bin',
+            'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+            'upload_date': '20220915',
+            'timestamp': 1663258727,
+            'duration': 623.019,
+            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin',
+            'description': 'a Paywall Videos video',
+        },
+    }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        embed_config = self._download_embed_config('media', video_id, url)
+        embed_config = self._download_embed_config('medias', video_id, url)
         return self._extract_media(embed_config)

+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        urls = list(super()._extract_embed_urls(url, webpage))
+        for match in cls._extract_wistia_async_embed(webpage):
+            if match.group('type') != 'wistia_channel':
+                urls.append('wistia:%s' % match.group('id'))
+        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})',
+                                 webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        if not WistiaChannelIE._extract_embed_urls(url, webpage):  # Fallback
+            media_id = cls._extract_url_media_id(url)
+            if media_id:
+                # use the id parsed from the URL; `match` from the loops above
+                # may be unbound at this point
+                urls.append('wistia:%s' % media_id)
+        return urls


 class WistiaPlaylistIE(WistiaBaseIE):
     _VALID_URL = r'%splaylists/%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)

@@ -208,7 +267,7 @@ class WistiaPlaylistIE(WistiaBaseIE):
     def _real_extract(self, url):
         playlist_id = self._match_id(url)

-        playlist = self._download_embed_config('playlist', playlist_id, url)
+        playlist = self._download_embed_config('playlists', playlist_id, url)

         entries = []
         for media in (try_get(playlist, lambda x: x[0]['medias']) or []):
@@ -218,3 +277,107 @@ def _real_extract(self, url):
             entries.append(self._extract_media(embed_config))

         return self.playlist_result(entries, playlist_id)
+
+
+class WistiaChannelIE(WistiaBaseIE):
+    _VALID_URL = r'(?:wistiachannel:|%schannel/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)
+
+    _TESTS = [{
+        # JSON Embed API returns 403, should fall back to webpage
+        'url': 'https://fast.wistia.net/embed/channel/yvyvu7wjbg?wchannelid=yvyvu7wjbg',
+        'info_dict': {
+            'id': 'yvyvu7wjbg',
+            'title': 'Copysmith Tutorials and Education!',
+            'description': 'Learn all things Copysmith via short and informative videos!'
+ }, + 'playlist_mincount': 7, + 'expected_warnings': ['falling back to webpage'], + }, { + 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l', + 'info_dict': { + 'id': '3802iirk0l', + 'title': 'The Roof', + }, + 'playlist_mincount': 20, + }, { + # link to popup video, follow --no-playlist + 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', + 'info_dict': { + 'id': 'sp5dqjzw3n', + 'ext': 'bin', + 'title': 'The Roof S2: The Modern CRO', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'duration': 86.487, + 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n', + 'timestamp': 1619790290, + 'upload_date': '20210430', + }, + 'params': {'noplaylist': True, 'skip_download': True}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.profitwell.com/recur/boxed-out', + 'info_dict': { + 'id': '6jyvmqz6zs', + 'title': 'Boxed Out', + 'description': 'md5:14a8a93a1dbe236718e6a59f8c8c7bae', + }, + 'playlist_mincount': 30, + }, { + # section instead of div + 'url': 'https://360learning.com/studio/onboarding-joei/', + 'info_dict': { + 'id': 'z874k93n2o', + 'title': 'Onboarding Joei.', + 'description': 'Coming to you weekly starting Feb 19th.', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt', + 'info_dict': { + 'id': 'pz0m0l0if3', + 'title': 'A Framework for Improving Product Team Performance', + 'ext': 'bin', + 'timestamp': 1653935275, + 'upload_date': '20220530', + 'description': 'Learn how to help your company improve and achieve your product related goals.', + 'duration': 1854.39, + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + }, + 'params': {'noplaylist': True, 'skip_download': True}, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + media_id = self._extract_url_media_id(url) + if not self._yes_playlist(channel_id, media_id, playlist_label='channel'): + return self.url_result(f'wistia:{media_id}', 'Wistia') + + try: + data = self._download_embed_config('channel', channel_id, url) + except (ExtractorError, urllib.error.HTTPError): + # Some channels give a 403 from the JSON API + self.report_warning('Failed to download channel data from API, falling back to webpage.') + webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) + data = self._parse_json( + self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id), + channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8'))) + + # XXX: can there be more than one series? 
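+        # Assumption, not verified against the channel API: only the first
+        # entry of 'series' is used, so any additional series are dropped.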
+ series = traverse_obj(data, ('series', 0), default={}) + + entries = [ + self.url_result(f'wistia:{video["hashedId"]}', WistiaIE, title=video.get('name')) + for video in traverse_obj(series, ('sections', ..., 'videos', ...)) or [] + if video.get('hashedId') + ] + + return self.playlist_result( + entries, channel_id, playlist_title=series.get('title'), playlist_description=series.get('description')) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for match in cls._extract_wistia_async_embed(webpage): + if match.group('type') == 'wistia_channel': + # original url may contain wmediaid query param + yield update_url_query(f'wistiachannel:{match.group("id")}', parse_qs(url)) From d42763a443107fa6a9d69c110f92c98857ca2406 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Sat, 24 Sep 2022 17:42:32 +1200 Subject: [PATCH 173/284] [extractor/rutube] Fix `_EMBED_REGEX` Closes https://github.com/yt-dlp/yt-dlp/issues/4797 Authored by: coletdjnz --- yt_dlp/extractor/rutube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index 380c5e14e8..34af0d594b 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -93,7 +93,7 @@ class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1'] + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', From faf7863bb0898c4a7972cd77b12a619bbc79c914 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Sat, 24 Sep 2022 18:30:31 +0900 Subject: [PATCH 174/284] [extractor/Smotrim] Add extractor (#5015) Authored by: nikita-moor, Lesmiscore --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/smotrim.py | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 yt_dlp/extractor/smotrim.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c2575bc928..f334b78330 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1619,6 +1619,7 @@ from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE +from .smotrim import SmotrimIE from .snotr import SnotrIE from .sohu import SohuIE from .sonyliv import ( diff --git a/yt_dlp/extractor/smotrim.py b/yt_dlp/extractor/smotrim.py new file mode 100644 index 0000000000..d3f1b695b3 --- /dev/null +++ b/yt_dlp/extractor/smotrim.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SmotrimIE(InfoExtractor): + _VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|video|article|live)/(?P<id>[0-9]+)' + _TESTS = [{ # video + 'url': 'https://smotrim.ru/video/1539617', + 'md5': 'b1923a533c8cab09679789d720d0b1c5', + 'info_dict': { + 'id': '1539617', + 'ext': 'mp4', + 'title': 'Полиглот. Китайский с нуля за 16 часов! Урок №16', + 'description': '', + }, + 'add_ie': ['RUTV'], + }, { # article (geo-restricted? plays fine from the US and JP) + 'url': 'https://smotrim.ru/article/2813445', + 'md5': 'e0ac453952afbc6a2742e850b4dc8e77', + 'info_dict': { + 'id': '2431846', + 'ext': 'mp4', + 'title': 'Новости культуры. 
Съёмки первой программы "Большие и маленькие"',
+            'description': 'md5:94a4a22472da4252bf5587a4ee441b99',
+        },
+        'add_ie': ['RUTV'],
+    }, {  # brand, redirect
+        'url': 'https://smotrim.ru/brand/64356',
+        'md5': '740472999ccff81d7f6df79cecd91c18',
+        'info_dict': {
+            'id': '2354523',
+            'ext': 'mp4',
+            'title': 'Большие и маленькие. Лучшее. 4-й выпуск',
+            'description': 'md5:84089e834429008371ea41ea3507b989',
+        },
+        'add_ie': ['RUTV'],
+    }, {  # live
+        'url': 'https://smotrim.ru/live/19201',
+        'info_dict': {
+            'id': '19201',
+            'ext': 'mp4',
+            # this looks like a TV channel name
+            'title': 'Россия Культура. Прямой эфир',
+            'description': '',
+        },
+        'add_ie': ['RUTV'],
+    }]
+
+    def _real_extract(self, url):
+        video_id, typ = self._match_valid_url(url).group('id', 'type')
+        rutv_type = 'video'
+        if typ not in ('video', 'live'):
+            webpage = self._download_webpage(url, video_id, f'Resolving {typ} link')
+            # there are two cases matching the regex:
+            #   1. "embedUrl" in JSON LD (/brand/)
+            #   2. "src" attribute from iframe (/article/)
+            video_id = self._search_regex(
+                r'"https://player.smotrim.ru/iframe/video/id/(?P<video_id>\d+)/',
+                webpage, 'video_id', default=None)
+            if not video_id:
+                raise ExtractorError('There is no video on this page.', expected=True)
+        elif typ == 'live':
+            rutv_type = 'live'
+
+        return self.url_result(f'https://player.vgtrk.com/iframe/{rutv_type}/id/{video_id}')

From 5c8b2ee9ecf8773eb463b4ae218f8313a6626b2f Mon Sep 17 00:00:00 2001
From: Lesmiscore <nao20010128@gmail.com>
Date: Sat, 24 Sep 2022 18:30:58 +0900
Subject: [PATCH 175/284] [extractor/RUTV] Fix warnings for livestreams (#5016)

Authored by: Lesmiscore
---
 yt_dlp/extractor/rutv.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py
index 0b07dc5ad8..75da01f7d3 100644
--- a/yt_dlp/extractor/rutv.py
+++ b/yt_dlp/extractor/rutv.py
@@ -141,7 +141,7 @@ def _real_extract(self, url):
         if media['errors']:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True)

-        view_count = playlist.get('count_views')
+        view_count = int_or_none(playlist.get('count_views'))
         priority_transport = playlist['priority_transport']

         thumbnail = media['picture']
@@ -152,6 +152,7 @@ def _real_extract(self, url):
         duration = int_or_none(media.get('duration'))

         formats = []
+        subtitles = {}

         for transport, links in media['sources'].items():
             for quality, url in links.items():
@@ -171,8 +172,10 @@ def _real_extract(self, url):
                         'vbr': str_to_int(quality),
                     }
                 elif transport == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        url, video_id, 'mp4', quality=preference, m3u8_id='hls'))
+                    fmt, subs = self._extract_m3u8_formats_and_subtitles(
+                        url, video_id, 'mp4', quality=preference, m3u8_id='hls')
+                    formats.extend(fmt)
+                    self._merge_subtitles(subs, target=subtitles)
                     continue
                 else:
                     fmt = {
@@ -186,7 +189,7 @@ def _real_extract(self, url):
                 })
                 formats.append(fmt)

-        self._sort_formats(formats)
+        self._sort_formats(formats, ('source', ))

         return {
             'id': video_id,
@@ -196,5 +199,6 @@ def _real_extract(self, url):
             'view_count': view_count,
             'duration': duration,
             'formats': formats,
+            'subtitles': subtitles,
             'is_live': is_live,
         }
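The m3u8 branch above funnels each playlist's subtitle tracks into a single dict through yt-dlp's `InfoExtractor._merge_subtitles`. As a rough sketch of the idea, using a hypothetical stand-in function rather than the helper's actual implementation:

    # Hypothetical stand-in for InfoExtractor._merge_subtitles: merge
    # {lang: [track, ...]} dicts into `target`, skipping tracks whose
    # URL has already been collected for that language.
    def merge_subtitles(source, target):
        for lang, tracks in source.items():
            existing = target.setdefault(lang, [])
            known_urls = {t.get('url') for t in existing}
            existing.extend(t for t in tracks if t.get('url') not in known_urls)

    subtitles = {}
    merge_subtitles({'ru': [{'url': 'https://www.example.com/ru.vtt', 'ext': 'vtt'}]}, subtitles)
    merge_subtitles({'ru': [{'url': 'https://www.example.com/ru.vtt', 'ext': 'vtt'}],
                     'en': [{'url': 'https://www.example.com/en.vtt', 'ext': 'vtt'}]}, subtitles)
    assert list(subtitles) == ['ru', 'en'] and len(subtitles['ru']) == 1

Called once per extracted playlist, this leaves one deduplicated track list per language, which is what the patch returns in the new `subtitles` field.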
From 0bd5a039ea234374821510ac0371e03e87a6a57f Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Sun, 25 Sep 2022 23:27:13 +0530
Subject: [PATCH 176/284] Playlists may not always have webpage_url

---
 yt_dlp/YoutubeDL.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 0bfc47767a..0d0a2ebe0d 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1687,8 +1687,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
         elif result_type in ('playlist', 'multi_video'):
             # Protect from infinite recursion due to recursively nested playlists
             # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
-            webpage_url = ie_result['webpage_url']
-            if webpage_url in self._playlist_urls:
+            webpage_url = ie_result.get('webpage_url')  # Playlists may not have webpage_url
+            if webpage_url and webpage_url in self._playlist_urls:
                 self.to_screen(
                     '[download] Skipping already downloaded playlist: %s'
                     % ie_result.get('title') or ie_result.get('id'))
@@ -1742,14 +1742,17 @@ def _playlist_infodict(ie_result, strict=False, **kwargs):
         }
         if strict:
             return info
+        if ie_result.get('webpage_url'):
+            info.update({
+                'webpage_url': ie_result['webpage_url'],
+                'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                'webpage_url_domain': get_domain(ie_result['webpage_url']),
+            })
         return {
             **info,
             'playlist_index': 0,
             '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
             'extractor': ie_result['extractor'],
-            'webpage_url': ie_result['webpage_url'],
-            'webpage_url_basename': url_basename(ie_result['webpage_url']),
-            'webpage_url_domain': get_domain(ie_result['webpage_url']),
             'extractor_key': ie_result['extractor_key'],
         }

From ab029d7e9200a273d7204be68c0735b16971ff44 Mon Sep 17 00:00:00 2001
From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com>
Date: Sun, 25 Sep 2022 23:03:19 +0200
Subject: [PATCH 177/284] [utils] `traverse_obj`: Rewrite, document and add tests (#5024)

Authored by: Grub4K
---
 test/test_utils.py | 187 ++++++++++++++++++++++++++++++++++
 yt_dlp/utils.py    | 247 ++++++++++++++++++++++++++------------------
 2 files changed, 332 insertions(+), 102 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 96477c53fc..69313564a1 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -109,6 +109,7 @@
     strip_or_none,
     subtitles_filename,
     timeconvert,
+    traverse_obj,
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
@@ -1874,6 +1875,192 @@ def test_get_compatible_ext(self):
         self.assertEqual(get_compatible_ext(
             vcodecs=['av1'], acodecs=['mp4a'], vexts=['webm'], aexts=['m4a'], preferences=('webm', 'mkv')), 'mkv')

+    def test_traverse_obj(self):
+        _TEST_DATA = {
+            100: 100,
+            1.2: 1.2,
+            'str': 'str',
+            'None': None,
+            '...': ...,
+            'urls': [
+                {'index': 0, 'url': 'https://www.example.com/0'},
+                {'index': 1, 'url': 'https://www.example.com/1'},
+            ],
+            'data': (
+                {'index': 2},
+                {'index': 3},
+            ),
+        }
+
+        # Test base functionality
+        self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str',
+                         msg='allow tuple path')
+        self.assertEqual(traverse_obj(_TEST_DATA, ['str']), 'str',
+                         msg='allow list path')
+        self.assertEqual(traverse_obj(_TEST_DATA, (value for value in ("str",))), 'str',
+                         msg='allow iterable path')
+        self.assertEqual(traverse_obj(_TEST_DATA, 'str'), 'str',
+                         msg='single items should be treated as a path')
+        self.assertEqual(traverse_obj(_TEST_DATA, None), _TEST_DATA)
+        self.assertEqual(traverse_obj(_TEST_DATA, 100), 100)
+        self.assertEqual(traverse_obj(_TEST_DATA, 1.2), 1.2)
+
+        # Test Ellipsis behavior
+        self.assertCountEqual(traverse_obj(_TEST_DATA, ...),
+                              (item for item in _TEST_DATA.values() if item is not None),
+                              msg='`...` should give all values except `None`')
+        self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, ...)), _TEST_DATA['urls'][0].values(),
+                              msg='`...` selection for dicts should select all values')
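+        # Illustration with the same test data: `...` also branches over a
+        # list, e.g. traverse_obj(_TEST_DATA, ('urls', ..., 'url'))
+        #   == ['https://www.example.com/0', 'https://www.example.com/1']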
+        self.assertEqual(traverse_obj(_TEST_DATA, (..., ..., 'url')),
+                         ['https://www.example.com/0', 'https://www.example.com/1'],
+                         msg='nested `...` queries should work')
+        self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4),
+                              msg='`...` query result should be flattened')
+
+        # Test function as key
+        self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)),
+                         [_TEST_DATA['urls']],
+                         msg='function as query key should perform a filter based on (key, value)')
+        self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'},
+                              msg='exceptions in the query function should be caught')
+
+        # Test alternative paths
+        self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str',
+                         msg='multiple `paths` should be treated as alternative paths')
+        self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str',
+                         msg='alternatives should exit early')
+        self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None,
+                         msg='alternatives should return `default` if exhausted')
+
+        # Test branch and path nesting
+        self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'],
+                         msg='tuple as key should be treated as branches')
+        self.assertEqual(traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')), ['https://www.example.com/0'],
+                         msg='list as key should be treated as branches')
+        self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))), ['https://www.example.com/0'],
+                         msg='double nesting in path should be treated as paths')
+        self.assertEqual(traverse_obj(['0', [1, 2]], [(0, 1), 0]), [1],
+                         msg='do not fail early on branching')
+        self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', ((1, ('fail', 'url')), (0, 'url')))),
+                              ['https://www.example.com/0', 'https://www.example.com/1'],
+                              msg='triple nesting in path should be treated as branches')
+        self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ('fail', (..., 'url')))),
+                         ['https://www.example.com/0', 'https://www.example.com/1'],
+                         msg='ellipsis as branch path start gets flattened')
+
+        # Test dictionary as key
+        self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}), {0: 100, 1: 1.2},
+                         msg='dict key should result in a dict with the same keys')
+        self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}),
+                         {0: 'https://www.example.com/0'},
+                         msg='dict key should allow paths')
+        self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}),
+                         {0: ['https://www.example.com/0']},
+                         msg='tuple in dict path should be treated as branches')
+        self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}),
+                         {0: ['https://www.example.com/0']},
+                         msg='double nesting in dict path should be treated as paths')
+        self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}),
+                         {0: ['https://www.example.com/1', 'https://www.example.com/0']},
+                         msg='triple nesting in dict path should be treated as branches')
+        self.assertEqual(traverse_obj({}, {0: 1}, default=...), {0: ...},
+                         msg='do not remove `None` values when dict key')
+
+        # Testing default parameter behavior
+        _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []}
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail'), None,
+                         msg='default value should be `None`')
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=...), ...,
+                         msg='chained fails should result in default')
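+        # Worked illustration with assumed data: a failing index lookup falls
+        # back to `default`, e.g. traverse_obj({'a': []}, ('a', 0), default=0) == 0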
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', 'int'), 0,
+                         msg='should not short circuit on `None`')
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', default=1), 1,
+                         msg='invalid dict key should result in `default`')
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', default=1), 1,
+                         msg='`None` is a deliberate sentinel and should become `default`')
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None,
+                         msg='`IndexError` should result in `default`')
+        self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1), 1,
+                         msg='if branched but not successful return `default`, not `[]`')
+
+        # Testing expected_type behavior
+        _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0}
+        self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str), 'str',
+                         msg='accept matching `expected_type` type')
+        self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None,
+                         msg='reject non-matching `expected_type` type')
+        self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)), '0',
+                         msg='transform type using type function')
+        self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str',
+                                      expected_type=lambda _: 1 / 0), None,
+                         msg='wrap expected_type function in try_call')
+        self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str), ['str'],
+                         msg='eliminate items that expected_type fails on')
+
+        # Test get_all behavior
+        _GET_ALL_DATA = {'key': [0, 1, 2]}
+        self.assertEqual(traverse_obj(_GET_ALL_DATA, ('key', ...), get_all=False), 0,
+                         msg='if not `get_all`, return only first matching value')
+        self.assertEqual(traverse_obj(_GET_ALL_DATA, ..., get_all=False), [0, 1, 2],
+                         msg='do not overflatten if not `get_all`')
+
+        # Test casesense behavior
+        _CASESENSE_DATA = {
+            'KeY': 'value0',
+            0: {
+                'KeY': 'value1',
+                0: {'KeY': 'value2'},
+            },
+        }
+        self.assertEqual(traverse_obj(_CASESENSE_DATA, 'key'), None,
+                         msg='dict keys should be case sensitive by default')
+        self.assertEqual(traverse_obj(_CASESENSE_DATA, 'keY',
+                                      casesense=False), 'value0',
+                         msg='allow non-matching key case if `casesense` is False')
+        self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ('keY',)),
+                                      casesense=False), ['value1'],
+                         msg='allow non-matching key case in branch if `casesense` is False')
+        self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ((0, 'keY'),)),
+                                      casesense=False), ['value2'],
+                         msg='allow non-matching key case in branch path if `casesense` is False')
+
+        # Test traverse_string behavior
+        _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2}
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)), None,
+                         msg='do not traverse into string if not `traverse_string`')
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0),
+                                      traverse_string=True), 's',
+                         msg='traverse into string if `traverse_string`')
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1),
+                                      traverse_string=True), '.',
+                         msg='traverse into converted data if `traverse_string`')
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...),
+                                      traverse_string=True), list('str'),
+                         msg='`...` branching into string should result in list')
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),
+                                      traverse_string=True), ['s', 'r'],
+                         msg='branching into string should result in list')
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x),
+                                      traverse_string=True), list('str'),
+                         msg='function branching into string should result in list')
+
+        # Test is_user_input behavior
+        _IS_USER_INPUT_DATA
= {'range8': list(range(8))} + self.assertEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3'), + is_user_input=True), 3, + msg='allow for string indexing if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3:'), + is_user_input=True), tuple(range(8))[3:], + msg='allow for string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':4:2'), + is_user_input=True), tuple(range(8))[:4:2], + msg='allow step in string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':'), + is_user_input=True), range(8), + msg='`:` should be treated as `...` if `is_user_input`') + with self.assertRaises(TypeError, msg='too many params should result in error'): + traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), is_user_input=True) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f6ab9905d4..bc100c9c32 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5,6 +5,7 @@ import calendar import codecs import collections +import collections.abc import contextlib import datetime import email.header @@ -3189,7 +3190,7 @@ def try_call(*funcs, expected_type=None, args=[], kwargs={}): for f in funcs: try: val = f(*args, **kwargs) - except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError): + except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError): pass else: if expected_type is None or isinstance(val, expected_type): @@ -5285,107 +5286,149 @@ def load_plugins(name, suffix, namespace): def traverse_obj( - obj, *path_list, default=None, expected_type=None, get_all=True, + obj, *paths, default=None, expected_type=None, get_all=True, casesense=True, is_user_input=False, traverse_string=False): - ''' Traverse nested list/dict/tuple - @param path_list A list of paths which are checked one by one. - Each path is a list of keys where each key is a: - - None: Do nothing - - string: A dictionary key / regex group - - int: An index into a list - - tuple: A list of keys all of which will be traversed - - Ellipsis: Fetch all values in the object - - Function: Takes the key and value as arguments - and returns whether the key matches or not - @param default Default value to return - @param expected_type Only accept final value of this type (Can also be any callable) - @param get_all Return all the values obtained from a path or only the first one - @param casesense Whether to consider dictionary keys as case sensitive + """ + Safely traverse nested `dict`s and `Sequence`s - The following are only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" - @param path_list In addition to the above, - - dict: Given {k:v, ...}; return {k: traverse_obj(obj, v), ...} - @param is_user_input Whether the keys are generated from user input. If True, - strings are converted to int/slice if necessary - @param traverse_string Whether to traverse inside strings. If True, any - non-compatible object will also be converted into a string - ''' # TODO: Write tests - if not casesense: - _lower = lambda k: (k.lower() if isinstance(k, str) else k) - path_list = (map(_lower, variadic(path)) for path in path_list) + Each of the provided `paths` is tested and the first producing a valid result will be returned. + A value of None is treated as the absence of a value. 
- def _traverse_obj(obj, path, _current_depth=0): - nonlocal depth - path = tuple(variadic(path)) - for i, key in enumerate(path): - if None in (key, obj): - return obj - if isinstance(key, (list, tuple)): - obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] - key = ... + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. - if key is ...: - obj = (obj.values() if isinstance(obj, dict) - else obj if isinstance(obj, (list, tuple, LazyList)) - else str(obj) if traverse_string else []) - _current_depth += 1 - depth = max(depth, _current_depth) - return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj] - elif isinstance(key, dict): - obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()}) - elif callable(key): - if isinstance(obj, (list, tuple, LazyList)): - obj = enumerate(obj) - elif isinstance(obj, dict): - obj = obj.items() - else: - if not traverse_string: - return None - obj = str(obj) - _current_depth += 1 - depth = max(depth, _current_depth) - return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))] - elif isinstance(obj, dict) and not (is_user_input and key == ':'): - obj = (obj.get(key) if casesense or (key in obj) - else next((v for k, v in obj.items() if _lower(k) == key), None)) - else: - if is_user_input: - key = (int_or_none(key) if ':' not in key - else slice(*map(int_or_none, key.split(':')))) - if key == slice(None): - return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth) - if not isinstance(key, (int, slice)): - return None - if not isinstance(obj, (list, tuple, LazyList)): - if not traverse_string: - return None - obj = str(obj) - try: - obj = obj[key] - except IndexError: - return None - return obj + The keys in the path can be one of: + - `None`: Return the current object. + - `str`/`int`: Return `obj[key]`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Sequence`s, `key` is the index of the value. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches + + @params paths Paths which to traverse by. + @param default Value to return if the paths do not match. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + + + @returns The result of the object traversal. 
+ If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + """ + is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes)) + casefold = lambda k: k.casefold() if isinstance(k, str) else k if isinstance(expected_type, type): type_test = lambda val: val if isinstance(val, expected_type) else None else: - type_test = expected_type or IDENTITY + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) - for path in path_list: - depth = 0 - val = _traverse_obj(obj, path) - if val is not None: - if depth: - for _ in range(depth - 1): - val = itertools.chain.from_iterable(v for v in val if v is not None) - val = [v for v in map(type_test, val) if v is not None] - if val: - return val if get_all else val[0] + def apply_key(key, obj): + if obj is None: + return + + elif key is None: + yield obj + + elif isinstance(key, (list, tuple)): + for branch in key: + _, result = apply_path(obj, branch) + yield from result + + elif key is ...: + if isinstance(obj, collections.abc.Mapping): + yield from obj.values() + elif is_sequence(obj): + yield from obj + elif traverse_string: + yield from str(obj) + + elif callable(key): + if is_sequence(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, collections.abc.Mapping): + iter_obj = obj.items() + elif traverse_string: + iter_obj = enumerate(str(obj)) else: - val = type_test(val) - if val is not None: - return val + return + yield from (v for k, v in iter_obj if try_call(key, args=(k, v))) + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) + yield {k: v if v is not None else default for k, v in iter_obj + if v is not None or default is not None} + + elif isinstance(obj, dict): + yield (obj.get(key) if casesense or (key in obj) + else next((v for k, v in obj.items() if casefold(k) == key), None)) + + else: + if is_user_input: + key = (int_or_none(key) if ':' not in key + else slice(*map(int_or_none, key.split(':')))) + + if not isinstance(key, (int, slice)): + return + + if not is_sequence(obj): + if not traverse_string: + return + obj = str(obj) + + with contextlib.suppress(IndexError): + yield obj[key] + + def apply_path(start_obj, path): + objs = (start_obj,) + has_branched = False + + for key in variadic(path): + if is_user_input and key == ':': + key = ... + + if not casesense and isinstance(key, str): + key = key.casefold() + + if key is ... 
or isinstance(key, (list, tuple)) or callable(key): + has_branched = True + + key_func = functools.partial(apply_key, key) + objs = itertools.chain.from_iterable(map(key_func, objs)) + + return has_branched, objs + + def _traverse_obj(obj, path): + has_branched, results = apply_path(obj, path) + results = LazyList(x for x in map(type_test, results) if x is not None) + if results: + return results.exhaust() if get_all and has_branched else results[0] + + for path in paths: + result = _traverse_obj(obj, path) + if result is not None: + return result + return default @@ -5437,7 +5480,7 @@ def jwt_decode_hs256(jwt): WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None -@functools.cache +@ functools.cache def supports_terminal_sequences(stream): if compat_os_name == 'nt': if not WINDOWS_VT_MODE: @@ -5587,7 +5630,7 @@ def __str__(self): *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs), delim='\n') - @staticmethod + @ staticmethod def read_file(filename, default=[]): try: optionf = open(filename, 'rb') @@ -5608,7 +5651,7 @@ def read_file(filename, default=[]): optionf.close() return res - @staticmethod + @ staticmethod def hide_login_info(opts): PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'} eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') @@ -5632,7 +5675,7 @@ def append_config(self, *args, label=None): if config.init(*args): self.configs.append(config) - @property + @ property def all_args(self): for config in reversed(self.configs): yield from config.all_args @@ -5679,7 +5722,7 @@ def __exit__(self, type, value, traceback): # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class - @staticmethod + @ staticmethod def run_with_loop(main, loop): if not asyncio.iscoroutine(main): raise ValueError(f'a coroutine was expected, got {main!r}') @@ -5691,7 +5734,7 @@ def run_with_loop(main, loop): if hasattr(loop, 'shutdown_default_executor'): loop.run_until_complete(loop.shutdown_default_executor()) - @staticmethod + @ staticmethod def _cancel_all_tasks(loop): to_cancel = asyncio.all_tasks(loop) @@ -5725,7 +5768,7 @@ def cached_method(f): """Cache a method""" signature = inspect.signature(f) - @functools.wraps(f) + @ functools.wraps(f) def wrapper(self, *args, **kwargs): bound_args = signature.bind(self, *args, **kwargs) bound_args.apply_defaults() @@ -5757,7 +5800,7 @@ class Namespace(types.SimpleNamespace): def __iter__(self): return iter(self.__dict__.values()) - @property + @ property def items_(self): return self.__dict__.items() @@ -5796,13 +5839,13 @@ def __init__(self, _retries, _error_callback, **kwargs): def _should_retry(self): return self._error is not NO_DEFAULT and self.attempt <= self.retries - @property + @ property def error(self): if self._error is NO_DEFAULT: return None return self._error - @error.setter + @ error.setter def error(self, value): self._error = value @@ -5814,7 +5857,7 @@ def __iter__(self): if self.error: self.error_callback(self.error, self.attempt, self.retries) - @staticmethod + @ staticmethod def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None): """Utility function for reporting retries""" if count > retries: From 914491b8e087d21b8a1714eb185008c29b6fe1e8 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Sep 
2022 02:52:21 +0530 Subject: [PATCH 178/284] [utils] `Popen.run`: Fix default return in binary mode --- yt_dlp/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index bc100c9c32..f935736922 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -891,8 +891,9 @@ def kill(self, *, timeout=0): @classmethod def run(cls, *args, timeout=None, **kwargs): with cls(*args, **kwargs) as proc: + default = '' if proc.text_mode else b'' stdout, stderr = proc.communicate_or_kill(timeout=timeout) - return stdout or '', stderr or '', proc.returncode + return stdout or default, stderr or default, proc.returncode def get_subprocess_encoding(): From 46a5b335e708c81bb6e9eb8cef0c13c72c497f0a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Sep 2022 02:53:08 +0530 Subject: [PATCH 179/284] [cookies] Let `_get_mac_keyring_password` fail gracefully Closes #4915 --- yt_dlp/cookies.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 24a8250dab..3032d07122 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -845,12 +845,15 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): def _get_mac_keyring_password(browser_keyring_name, logger): logger.debug('using find-generic-password to obtain password from OSX keychain') try: - stdout, _, _ = Popen.run( + stdout, _, returncode = Popen.run( ['security', 'find-generic-password', '-w', # write password to stdout '-a', browser_keyring_name, # match 'account' '-s', f'{browser_keyring_name} Safe Storage'], # match 'service' stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + if returncode: + logger.warning('find-generic-password failed') + return None return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running find-generic-password: {error_to_str(e)}') From 0500ee3d81c5d31500d7093512deee2b0ff8aacd Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Mon, 26 Sep 2022 03:03:52 +0530 Subject: [PATCH 180/284] Don't download entire video when no matching `--download-sections` --- yt_dlp/YoutubeDL.py | 11 ++++------- yt_dlp/utils.py | 3 +++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 0d0a2ebe0d..7b0616cba8 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2700,24 +2700,21 @@ def is_wellformed(f): # Process what we can, even without any available formats. 
formats_to_download = [{}] - requested_ranges = self.params.get('download_ranges') - if requested_ranges: - requested_ranges = tuple(requested_ranges(info_dict, self)) - + requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self)) best_format, downloaded_formats = formats_to_download[-1], [] if download: - if best_format: + if best_format and requested_ranges: def to_screen(*msg): self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') to_screen(f'Downloading {len(formats_to_download)} format(s):', (f['format_id'] for f in formats_to_download)) - if requested_ranges: + if requested_ranges != ({}, ): to_screen(f'Downloading {len(requested_ranges)} time ranges:', (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges)) max_downloads_reached = False - for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]): + for fmt, chapter in itertools.product(formats_to_download, requested_ranges): new_info = self._copy_infodict(info_dict) new_info.update(fmt) offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f935736922..d655bfdd03 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3793,6 +3793,9 @@ def __init__(self, chapters, ranges): self.chapters, self.ranges = chapters, ranges def __call__(self, info_dict, ydl): + if not self.ranges and not self.chapters: + yield {} + warning = ('There are no chapters matching the regex' if info_dict.get('chapters') else 'Cannot match chapters since chapter information is unavailable') for regex in self.chapters or []: From 0ca0f88121db5a1e9c223077af1b78c62d5ead6d Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Mon, 26 Sep 2022 00:58:06 +0000 Subject: [PATCH 181/284] [extractor/heise] Fix extractor (#5029) Fixes https://github.com/yt-dlp/yt-dlp/issues/1520 Authored by: coletdjnz --- yt_dlp/extractor/heise.py | 67 +++++++++++++++++++++++++++++-------- yt_dlp/extractor/youtube.py | 2 +- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/heise.py b/yt_dlp/extractor/heise.py index 4f689c6e49..86661d75a1 100644 --- a/yt_dlp/extractor/heise.py +++ b/yt_dlp/extractor/heise.py @@ -1,10 +1,12 @@ +import urllib.parse + from .common import InfoExtractor from .kaltura import KalturaIE from .youtube import YoutubeIE from ..utils import ( + NO_DEFAULT, determine_ext, int_or_none, - NO_DEFAULT, parse_iso8601, smuggle_url, xpath_text, @@ -23,6 +25,9 @@ class HeiseIE(InfoExtractor): 'timestamp': 1512734959, 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'duration': 2845, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -34,11 +39,27 @@ class HeiseIE(InfoExtractor): 'info_dict': { 'id': '6kmWbXleKW4', 'ext': 'mp4', - 'title': 'NEU IM SEPTEMBER | Netflix', - 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'title': 'Neu im September 2017 | Netflix', + 'description': 'md5:d6852d1f96bb80760608eed3b907437c', 'upload_date': '20170830', 'uploader': 'Netflix Deutschland, Österreich und Schweiz', 'uploader_id': 'netflixdach', + 'categories': ['Entertainment'], + 'tags': 'count:27', + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_id': 'UCZqgRlLcvO3Fnx_npQJygcQ', + 'thumbnail': 'https://i.ytimg.com/vi_webp/6kmWbXleKW4/maxresdefault.webp', + 'uploader_url': 
'http://www.youtube.com/user/netflixdach', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCZqgRlLcvO3Fnx_npQJygcQ', + 'view_count': int, + 'channel': 'Netflix Deutschland, Österreich und Schweiz', + 'channel_follower_count': int, + 'like_count': int, + 'duration': 67, }, 'params': { 'skip_download': True, @@ -52,11 +73,15 @@ class HeiseIE(InfoExtractor): 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 'timestamp': 1512470717, 'upload_date': '20171205', + 'duration': 786, + 'view_count': int, + 'thumbnail': 're:^https?://.*/thumbnail/.*', }, 'params': { 'skip_download': True, }, }, { + # FIXME: Video m3u8 fails to download; issue with Kaltura extractor 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 'info_dict': { 'id': '1_59mk80sf', @@ -69,6 +94,18 @@ class HeiseIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # videout + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-3-8-Anonyme-SIM-Karten-G-Sync-Monitore-Citizenfour-2440327.html', + 'info_dict': { + 'id': '2440327', + 'ext': 'mp4', + 'title': 'c\'t uplink 3.8: Anonyme SIM-Karten, G-Sync-Monitore, Citizenfour', + 'thumbnail': 'http://www.heise.de/imagine/yxM2qmol0xV3iFB7qFb70dGvXjc/gallery/', + 'description': 'md5:fa164d8c8707dff124a9626d39205f5d', + 'timestamp': 1414825200, + 'upload_date': '20141101', + } }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -127,20 +164,22 @@ def _make_kaltura_result(kaltura_url): yt_urls, video_id, title, ie=YoutubeIE.ie_key()) title = extract_title() + api_params = urllib.parse.parse_qs( + self._search_regex(r'/videout/feed\.json\?([^\']+)', webpage, 'feed params', default=None) or '') + if not api_params or 'container' not in api_params or 'sequenz' not in api_params: + container_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', + webpage, 'container ID') - container_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', - webpage, 'container ID') - - sequenz_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', - webpage, 'sequenz ID') - - doc = self._download_xml( - 'http://www.heise.de/videout/feed', video_id, query={ + sequenz_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + api_params = { 'container': container_id, 'sequenz': sequenz_id, - }) + } + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query=api_params) formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2afb993d01..83be162c9e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1009,7 +1009,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _EMBED_REGEX = [ r'''(?x) (?: - <iframe[^>]+?src=| + <(?:[0-9A-Za-z-]+?)?iframe[^>]+?src=| data-video-url=| <embed[^>]+?src=| embedSWF\(?:\s*| From 1534aba8658294913d58accbc6688574c9911585 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:43:54 +0200 Subject: [PATCH 182/284] [extractor/artetv] Remove duplicate stream urls (#5047) Closes #4510 Authored by: Grub4K --- yt_dlp/extractor/arte.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 
deletions(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 25ecb42301..d3ec4a66c8 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -135,6 +135,7 @@ def _real_extract(self, url): 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True) formats, subtitles = [], {} + secondary_formats = [] for stream in config['data']['attributes']['streams']: # official player contains code like `e.get("versions")[0].eStat.ml5` stream_version = stream['versions'][0] @@ -152,22 +153,26 @@ def _real_extract(self, url): not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles ))) + short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') if stream['protocol'].startswith('HLS'): fmts, subs = self._extract_m3u8_formats_and_subtitles( stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) for fmt in fmts: fmt.update({ - 'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]', + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', 'language_preference': lang_pref, }) - formats.extend(fmts) + if any(map(short_label.startswith, ('cc', 'OGsub'))): + secondary_formats.extend(fmts) + else: + formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif stream['protocol'] in ('HTTPS', 'RTMP'): formats.append({ 'format_id': f'{stream["protocol"]}-{stream_version_code}', 'url': stream['url'], - 'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]', + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', 'language_preference': lang_pref, # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS }) @@ -179,6 +184,8 @@ def _real_extract(self, url): # The JS also looks for chapters in config['data']['attributes']['chapters'], # but I am yet to find a video having those + formats.extend(secondary_formats) + self._remove_duplicate_formats(formats) self._sort_formats(formats) metadata = config['data']['attributes']['metadata'] From 0f60ba6e656516ec24d619d20d61249be6296105 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 02:30:50 +0530 Subject: [PATCH 183/284] [extractor] Improve json+ld extraction Related #5035 --- yt_dlp/extractor/common.py | 11 +++++++++-- yt_dlp/extractor/generic.py | 2 +- yt_dlp/utils.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 87660bb23b..d36f025ab8 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1536,10 +1536,10 @@ def extract_chapter_information(e): info['chapters'] = chapters def extract_video_object(e): - assert is_type(e, 'VideoObject') author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), + 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'thumbnails': [{'url': unescapeHTML(url)} @@ -1552,12 +1552,19 @@ def extract_video_object(e): # however some websites are using 'Text' type instead. # 1. 
https://schema.org/VideoObject 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), + 'tags': try_call(lambda: e.get('keywords').split(',')), }) + if is_type(e, 'AudioObject'): + info.update({ + 'vcodec': 'none', + 'abr': int_or_none(e.get('bitrate')), + }) extract_interaction_statistic(e) extract_chapter_information(e) @@ -1608,7 +1615,7 @@ def traverse_json_ld(json_ld, at_top_level=True): extract_video_object(e['video'][0]) elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): extract_video_object(e['subjectOf'][0]) - elif is_type(e, 'VideoObject'): + elif is_type(e, 'VideoObject', 'AudioObject'): extract_video_object(e) if expected_type is None: continue diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 672034c6d7..73aefc7829 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2910,7 +2910,7 @@ def _real_extract(self, url): if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') return merge_dicts({ - '_type': 'url_transparent', + '_type': 'video' if json_ld.get('ext') else 'url_transparent', 'url': smuggle_url(json_ld['url'], { 'force_videoid': video_id, 'to_generic': True, diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d655bfdd03..724e34ef7d 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -232,7 +232,7 @@ def random_user_agent(): ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>' +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>' NUMBER_RE = r'\d+(?:\.\d+)?' 
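For illustration, a rough standalone sketch, separate from the patch itself, of what the widened `JSON_LD_RE` now accepts: a top-level array inside the ld+json block, whose `AudioObject` entries the extractor maps to audio-only metadata. The page markup below is invented:

    import json
    import re

    JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

    webpage = '''<script type="application/ld+json">
    [{"@type": "AudioObject", "name": "Episode 1",
      "contentUrl": "https://www.example.com/ep1.mp3",
      "encodingFormat": "audio/mpeg", "bitrate": "128"}]
    </script>'''

    data = json.loads(re.search(JSON_LD_RE, webpage).group('json_ld'))
    for entry in data:  # a top-level array now matches, not only a single object
        if entry.get('@type') == 'AudioObject':
            # mirrors the new extract_video_object behaviour: AudioObject
            # entries become audio-only (vcodec 'none'), 'bitrate' maps to abr
            print(entry['contentUrl'], entry.get('bitrate'))

The `(?is)` flags make the match case-insensitive and let `.` cross the newlines inside the payload.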
From 0a5095fe8d9e944e3832be8125fbb3133500f9cc Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 03:55:58 +0530 Subject: [PATCH 184/284] [extractor/youtube:tab] Support `reporthistory` page Closes #4929 --- yt_dlp/extractor/youtube.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 83be162c9e..5760e96f5f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -292,7 +292,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' - r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') + r'storefront|oops|index|account|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' @@ -673,7 +673,7 @@ def _extract_continuation(cls, renderer): return next_continuation contents = [] - for key in ('contents', 'items'): + for key in ('contents', 'items', 'rows'): contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: @@ -4405,6 +4405,13 @@ def _rich_grid_entries(self, contents): yield entry ''' + def _report_history_entries(self, renderer): + for url in traverse_obj(renderer, ( + 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., + 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): + yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) + def _extract_entries(self, parent_renderer, continuation_list): # continuation_list is modified in-place with continuation_list = [continuation_token] continuation_list[:] = [None] @@ -4416,12 +4423,16 @@ def _extract_entries(self, parent_renderer, continuation_list): content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', expected_type=dict) if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: - for entry in self._rich_entries(renderer): + if content.get('richItemRenderer'): + for entry in self._rich_entries(content['richItemRenderer']): yield entry continuation_list[0] = self._extract_continuation(parent_renderer) + elif content.get('reportHistorySectionRenderer'): # https://www.youtube.com/reporthistory + table = traverse_obj(content, ('reportHistorySectionRenderer', 'table', 'tableRenderer')) + yield from self._report_history_entries(table) + continuation_list[0] = self._extract_continuation(table) continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] for isr_content in isr_contents: if not isinstance(isr_content, dict): @@ -4510,7 +4521,8 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): 'playlistVideoRenderer': (self._playlist_entries, 'contents'), 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds 'richItemRenderer': (extract_entries, 'contents'), # for hashtag - 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents') + 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), + 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), } on_response_received = dict_get(response, ('onResponseReceivedActions', 
'onResponseReceivedEndpoints')) continuation_items = try_get( From 1dd18a88087d92357c9a2d942ecc4d678ab04641 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 04:19:12 +0530 Subject: [PATCH 185/284] [extractor/YoutubeShortsAudioPivot] Support `source` URLs `ytshortsap:` is no longer needed --- yt_dlp/extractor/youtube.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5760e96f5f..ededf8c756 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -291,7 +291,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' - r'browse|oembed|get_video_info|iframe_api|s/player|' + r'browse|oembed|get_video_info|iframe_api|s/player|source|' r'storefront|oops|index|account|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' @@ -6330,14 +6330,11 @@ def _real_extract(self, url): class YoutubeShortsAudioPivotIE(InfoExtractor): - IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video); "ytshortsap:" prefix' + IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' IE_NAME = 'youtube:shorts:pivot:audio' - _VALID_URL = f'(?x)^ytshortsap:{YoutubeIE._VALID_URL[5:]}' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts' _TESTS = [{ - 'url': 'ytshortsap:https://www.youtube.com/shorts/Lyj-MZSAA9o?feature=share', - 'only_matching': True, - }, { - 'url': 'ytshortsap:Lyj-MZSAA9o', + 'url': 'https://www.youtube.com/source/Lyj-MZSAA9o/shorts', 'only_matching': True, }] From 1fb53b946c5aca3755bf72cc1c204925043b04f7 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 04:44:54 +0530 Subject: [PATCH 186/284] [extractor/youtube:tab] Improve continuation items extraction --- yt_dlp/extractor/youtube.py | 47 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ededf8c756..c4aa6f8fe1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4493,26 +4493,6 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data - known_continuation_renderers = { - 'playlistVideoListContinuation': self._playlist_entries, - 'gridContinuation': self._grid_entries, - 'itemSectionContinuation': self._post_thread_continuation_entries, - 'sectionListContinuation': extract_entries, # for feeds - } - continuation_contents = try_get( - response, lambda x: x['continuationContents'], dict) or {} - continuation_renderer = None - for key, value in continuation_contents.items(): - if key not in known_continuation_renderers: - continue - continuation_renderer = value - continuation_list = [None] - yield from known_continuation_renderers[key](continuation_renderer) - continuation = continuation_list[0] or self._extract_continuation(continuation_renderer) - break - if continuation_renderer: - continue - known_renderers = { 'videoRenderer': (self._grid_entries, 'items'), # for membership tab 'gridPlaylistRenderer': (self._grid_entries, 'items'), @@ -4523,23 +4503,30 @@ def _entries(self, tab, 
item_id, ytcfg, account_syncid, visitor_data): 'richItemRenderer': (extract_entries, 'contents'), # for hashtag 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), + 'playlistVideoListContinuation': (self._playlist_entries, None), + 'gridContinuation': (self._grid_entries, None), + 'itemSectionContinuation': (self._post_thread_continuation_entries, None), + 'sectionListContinuation': (extract_entries, None), # for feeds } - on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) - continuation_items = try_get( - on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) - continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {} + + continuation_items = traverse_obj(response, ( + ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ..., + 'appendContinuationItemsAction', 'continuationItems' + ), 'continuationContents', get_all=False) + continuation_item = traverse_obj(continuation_items, 0, None, expected_type=dict, default={}) + video_items_renderer = None - for key, value in continuation_item.items(): + for key in continuation_item.keys(): if key not in known_renderers: continue - video_items_renderer = {known_renderers[key][1]: continuation_items} + func, parent_key = known_renderers[key] + video_items_renderer = {parent_key: continuation_items} if parent_key else continuation_items continuation_list = [None] - yield from known_renderers[key][0](video_items_renderer) + yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) + + if not video_items_renderer: break - if video_items_renderer: - continue - break @staticmethod def _extract_selected_tab(tabs, fatal=True): From 709ee214170cdb3e91f68062a07f52d1a24a8c89 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 08:25:31 +0530 Subject: [PATCH 187/284] [extractor/youtube] Do not warn on duplicate chapters Eg: vYbaM8w8yzw --- yt_dlp/extractor/youtube.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c4aa6f8fe1..a9d838345d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3034,8 +3034,9 @@ def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, self.report_warning(f'Incomplete chapter {idx}') elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: chapters.append(chapter) - else: - self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"') + elif chapter not in chapters: + self.report_warning( + f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') return chapters[1:] def _extract_comment(self, comment_renderer, parent=None): From 7a32c70d13558977ec4e26900d6d4b0aa8614713 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 27 Sep 2022 08:32:57 +0530 Subject: [PATCH 188/284] [cleanup] Fix flake8 and minor refactor Issues from ab029d7e9200a273d7204be68c0735b16971ff44, 1fb53b946c5aca3755bf72cc1c204925043b04f7 --- yt_dlp/extractor/youtube.py | 27 +++++++++------------------ yt_dlp/utils.py | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a9d838345d..f73465ba4c 
100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -30,6 +30,7 @@ clean_html, datetime_from_str, dict_get, + filter_dict, float_or_none, format_field, get_first, @@ -617,7 +618,7 @@ def generate_api_headers( if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin - return {h: v for h, v in headers.items() if v is not None} + return filter_dict(headers) def _download_ytcfg(self, client, video_id): url = { @@ -672,20 +673,10 @@ def _extract_continuation(cls, renderer): if next_continuation: return next_continuation - contents = [] - for key in ('contents', 'items', 'rows'): - contents.extend(try_get(renderer, lambda x: x[key], list) or []) - - for content in contents: - if not isinstance(content, dict): - continue - continuation_ep = try_get( - content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'], - lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']), - dict) - continuation = cls._extract_continuation_ep_data(continuation_ep) - if continuation: - return continuation + return traverse_obj(renderer, ( + ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')) + ), get_all=False, expected_type=cls._extract_continuation_ep_data) @classmethod def _extract_alerts(cls, data): @@ -4408,8 +4399,8 @@ def _rich_grid_entries(self, contents): def _report_history_entries(self, renderer): for url in traverse_obj(renderer, ( - 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., - 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., + 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., + 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) @@ -4553,7 +4544,7 @@ def _extract_uploader(self, data): uploader['uploader_url'] = urljoin( 'https://www.youtube.com/', try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], str)) - return {k: v for k, v in uploader.items() if v is not None} + return filter_dict(uploader) def _extract_from_tabs(self, item_id, ytcfg, data, tabs): playlist_id = title = description = channel_url = channel_name = channel_id = None diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 724e34ef7d..3e2ce84345 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5484,7 +5484,7 @@ def jwt_decode_hs256(jwt): WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None -@ functools.cache +@functools.cache def supports_terminal_sequences(stream): if compat_os_name == 'nt': if not WINDOWS_VT_MODE: @@ -5634,7 +5634,7 @@ def __str__(self): *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs), delim='\n') - @ staticmethod + @staticmethod def read_file(filename, default=[]): try: optionf = open(filename, 'rb') @@ -5655,7 +5655,7 @@ def read_file(filename, default=[]): optionf.close() return res - @ staticmethod + @staticmethod def hide_login_info(opts): PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'} eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') @@ -5679,7 +5679,7 @@ def append_config(self, *args, label=None): if config.init(*args): self.configs.append(config) - @ property + @property def all_args(self): for config in 
reversed(self.configs): yield from config.all_args @@ -5726,7 +5726,7 @@ def __exit__(self, type, value, traceback): # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class - @ staticmethod + @staticmethod def run_with_loop(main, loop): if not asyncio.iscoroutine(main): raise ValueError(f'a coroutine was expected, got {main!r}') @@ -5738,7 +5738,7 @@ def run_with_loop(main, loop): if hasattr(loop, 'shutdown_default_executor'): loop.run_until_complete(loop.shutdown_default_executor()) - @ staticmethod + @staticmethod def _cancel_all_tasks(loop): to_cancel = asyncio.all_tasks(loop) @@ -5772,7 +5772,7 @@ def cached_method(f): """Cache a method""" signature = inspect.signature(f) - @ functools.wraps(f) + @functools.wraps(f) def wrapper(self, *args, **kwargs): bound_args = signature.bind(self, *args, **kwargs) bound_args.apply_defaults() @@ -5804,7 +5804,7 @@ class Namespace(types.SimpleNamespace): def __iter__(self): return iter(self.__dict__.values()) - @ property + @property def items_(self): return self.__dict__.items() @@ -5843,13 +5843,13 @@ def __init__(self, _retries, _error_callback, **kwargs): def _should_retry(self): return self._error is not NO_DEFAULT and self.attempt <= self.retries - @ property + @property def error(self): if self._error is NO_DEFAULT: return None return self._error - @ error.setter + @error.setter def error(self, value): self._error = value @@ -5861,7 +5861,7 @@ def __iter__(self): if self.error: self.error_callback(self.error, self.attempt, self.retries) - @ staticmethod + @staticmethod def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None): """Utility function for reporting retries""" if count > retries: From c04cc2e28e2a6c2e3384fb203796714d739ae42a Mon Sep 17 00:00:00 2001 From: Kyle Anthony Williams <kyle.anthony.williams2@gmail.com> Date: Tue, 27 Sep 2022 10:22:06 -0400 Subject: [PATCH 189/284] [extractor/soundcloud:search] More metadata in `--flat-playlist` (#4965) Authored by: SuperSonicHub1 --- yt_dlp/extractor/soundcloud.py | 338 +++++++++++++++++---------------- 1 file changed, 170 insertions(+), 168 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 2730052a01..228e19c3e9 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -60,6 +60,21 @@ class SoundcloudBaseIE(InfoExtractor): _access_token = None _HEADERS = {} + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } + def _store_client_id(self, client_id): self.cache.store('soundcloud', 'client_id', client_id) @@ -179,6 +194,158 @@ def sign(self, user, pw, clid): return out + def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False): + track_id = compat_str(info['id']) + title = info['title'] + + format_urls = set() + formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token: + query['secret_token'] = secret_token + + if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'): + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if 
redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'quality': 10, + }) + + def invalid_url(url): + return not url or url in format_urls + + def add_format(f, protocol, is_preview=False): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': protocol, + 'preference': -10 if preview else None, + }) + formats.append(f) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = None if extract_flat else self._download_json( + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ + 'url': stream_url, + 'ext': ext, + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) + + for f in formats: + f['vcodec'] = 'none' + + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted(metadata_available=True) + self._sort_formats(formats) + + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': 
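                # NB: SoundCloud reports track durations in milliseconds, hence the
                # scale argument of 1000 on the next line -- e.g.
                # float_or_none(4000990, 1000) == 4000.99, matching the test above.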
float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats if not extract_flat else None + } + @classmethod def _resolv_url(cls, url): return cls._API_V2_BASE + 'resolve?url=' + url @@ -377,173 +544,6 @@ class SoundcloudIE(SoundcloudBaseIE): }, ] - _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' - - _ARTWORK_MAP = { - 'mini': 16, - 'tiny': 20, - 'small': 32, - 'badge': 47, - 't67x67': 67, - 'large': 100, - 't300x300': 300, - 'crop': 400, - 't500x500': 500, - 'original': 0, - } - - def _extract_info_dict(self, info, full_title=None, secret_token=None): - track_id = compat_str(info['id']) - title = info['title'] - - format_urls = set() - formats = [] - query = {'client_id': self._CLIENT_ID} - if secret_token: - query['secret_token'] = secret_token - - if info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: - urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, fatal=False) - if urlh: - format_url = urlh.geturl() - format_urls.add(format_url) - formats.append({ - 'format_id': 'download', - 'ext': urlhandle_detect_ext(urlh) or 'mp3', - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - 'url': format_url, - 'quality': 10, - }) - - def invalid_url(url): - return not url or url in format_urls - - def add_format(f, protocol, is_preview=False): - mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) - if mobj: - for k, v in mobj.groupdict().items(): - if not f.get(k): - f[k] = v - format_id_list = [] - if protocol: - format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f['abr'] = '256' - for k in ('ext', 'abr'): - v = f.get(k) - if v: - format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') - abr = f.get('abr') - if abr: - f['abr'] = int(abr) - if protocol == 'hls': - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' - f.update({ - 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, - }) - formats.append(f) - - # New API - transcodings = try_get( - info, lambda x: x['media']['transcodings'], list) or [] - for t in transcodings: - if not isinstance(t, dict): - continue - format_url = url_or_none(t.get('url')) - if not format_url: - continue - stream = self._download_json( - format_url, track_id, query=query, fatal=False, headers=self._HEADERS) - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('url')) - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - stream_format = t.get('format') or {} - protocol = stream_format.get('protocol') - if protocol != 'hls' and '/hls' in format_url: - protocol = 'hls' - ext = None - preset = str_or_none(t.get('preset')) - if preset: - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(stream_format.get('mime_type')) - add_format({ - 'url': stream_url, - 'ext': ext, - }, 'http' if protocol == 'progressive' else 
protocol, - t.get('snipped') or '/preview/' in format_url) - - for f in formats: - f['vcodec'] = 'none' - - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted(metadata_available=True) - self._sort_formats(formats) - - user = info.get('user') or {} - - thumbnails = [] - artwork_url = info.get('artwork_url') - thumbnail = artwork_url or user.get('avatar_url') - if isinstance(thumbnail, compat_str): - if re.search(self._IMAGE_REPL_RE, thumbnail): - for image_id, size in self._ARTWORK_MAP.items(): - i = { - 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), - } - if image_id == 'tiny' and not artwork_url: - size = 18 - elif image_id == 'original': - i['preference'] = 10 - if size: - i.update({ - 'width': size, - 'height': size, - }) - thumbnails.append(i) - else: - thumbnails = [{'url': thumbnail}] - - def extract_count(key): - return int_or_none(info.get('%s_count' % key)) - - return { - 'id': track_id, - 'uploader': user.get('username'), - 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), - 'uploader_url': user.get('permalink_url'), - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, - 'description': info.get('description'), - 'thumbnails': thumbnails, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': extract_count('playback'), - 'like_count': extract_count('favoritings') or extract_count('likes'), - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), - 'formats': formats - } - def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -891,6 +891,7 @@ class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor): _TESTS = [{ 'url': 'scsearch15:post-avant jazzcore', 'info_dict': { + 'id': 'post-avant jazzcore', 'title': 'post-avant jazzcore', }, 'playlist_count': 15, @@ -917,7 +918,8 @@ def _get_collection(self, endpoint, collection_id, **query): for item in response.get('collection') or []: if item: - yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + yield self.url_result( + item['uri'], SoundcloudIE.ie_key(), **self._extract_info_dict(item, extract_flat=True)) next_url = response.get('next_href') if not next_url: From 292fdad2970362743e8f0cf88cbd2d78edbc1fcd Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Tue, 27 Sep 2022 17:27:47 +0200 Subject: [PATCH 190/284] [extractor/dplay:italy] Add default authentication (#5056) Closes #2950 Authored by: Timendum --- yt_dlp/extractor/dplay.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index e16856b2b9..e7629a5e16 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -907,6 +907,9 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): _TESTS = [{ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer', + 'only_matching': True, }] _PRODUCT = 'dplus_us' @@ -916,6 +919,13 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): 'country': 'it', } + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + class 
DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)' From 9d69c4e4b44077cf9138b0d2c4af7ce199492737 Mon Sep 17 00:00:00 2001 From: Fabi019 <fabi019@gmx.de> Date: Tue, 27 Sep 2022 18:05:31 +0200 Subject: [PATCH 191/284] [extractor/BerufeTV] Add extractor (#4921) Closes #4632 Authored by: Fabi019 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/berufetv.py | 70 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 yt_dlp/extractor/berufetv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f334b78330..2321ed2abc 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -172,6 +172,7 @@ from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE from .beatport import BeatportIE +from .berufetv import BerufeTVIE from .bet import BetIE from .bfi import BFIPlayerIE from .bfmtv import ( diff --git a/yt_dlp/extractor/berufetv.py b/yt_dlp/extractor/berufetv.py new file mode 100644 index 0000000000..8160cbd9a7 --- /dev/null +++ b/yt_dlp/extractor/berufetv.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import float_or_none, mimetype2ext, traverse_obj + + +class BerufeTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?web\.arbeitsagentur\.de/berufetv/[^?#]+/film;filmId=(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://web.arbeitsagentur.de/berufetv/studienberufe/wirtschaftswissenschaften/wirtschaftswissenschaften-volkswirtschaft/film;filmId=DvKC3DUpMKvUZ_6fEnfg3u', + 'md5': '041b6432ec8e6838f84a5c30f31cc795', + 'info_dict': { + 'id': 'DvKC3DUpMKvUZ_6fEnfg3u', + 'ext': 'mp4', + 'title': 'Volkswirtschaftslehre', + 'description': 'md5:6bd87d0c63163480a6489a37526ee1c1', + 'categories': ['Studien­beruf'], + 'tags': ['Studienfilm'], + 'duration': 602.440, + 'thumbnail': r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + movie_metadata = self._download_json( + 'https://rest.arbeitsagentur.de/infosysbub/berufetv/pc/v1/film-metadata', + video_id, 'Downloading JSON metadata', + headers={'X-API-Key': '79089773-4892-4386-86e6-e8503669f426'}, fatal=False) + + meta = traverse_obj( + movie_metadata, ('metadaten', lambda _, i: video_id == i['miId']), + get_all=False, default={}) + + video = self._download_json( + f'https://d.video-cdn.net/play/player/8YRzUk6pTzmBdrsLe9Y88W/video/{video_id}', + video_id, 'Downloading video JSON') + + formats, subtitles = [], {} + for key, source in video['videoSources']['html'].items(): + if key == 'auto': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source[0]['source'], video_id) + formats += fmts + subtitles = subs + else: + formats.append({ + 'url': source[0]['source'], + 'ext': mimetype2ext(source[0]['mimeType']), + 'format_id': key, + }) + + for track in video.get('videoTracks') or []: + if track.get('type') != 'SUBTITLES': + continue + subtitles.setdefault(track['language'], []).append({ + 'url': track['source'], + 'name': track.get('label'), + 'ext': 'vtt' + }) + + return { + 'id': video_id, + 'title': meta.get('titel') or traverse_obj(video, ('videoMetaData', 'title')), + 'description': meta.get('beschreibung'), + 'thumbnail': meta.get('thumbnail') or f'https://asset-out-cdn.video-cdn.net/private/videos/{video_id}/thumbnails/active', + 'duration': 
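            # NB: scale=1000 suggests the player JSON reports duration in
            # milliseconds -- float_or_none(602440, scale=1000) == 602.44,
            # which matches the 602.440 duration in the test case above.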
float_or_none(video.get('duration'), scale=1000), + 'categories': [meta['kategorie']] if meta.get('kategorie') else None, + 'tags': meta.get('themengebiete'), + 'subtitles': subtitles, + 'formats': formats, + } From c9eba8075f000fdfab81b3ca11a8816d5835abf7 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Wed, 28 Sep 2022 06:37:12 +0000 Subject: [PATCH 192/284] [extractor/wordpress:playlist] Add generic embed extractor (#5012) Fixes https://github.com/yt-dlp/yt-dlp/issues/4955 Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/wordpress.py | 69 +++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 yt_dlp/extractor/wordpress.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2321ed2abc..fa33866df0 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2146,6 +2146,7 @@ WistiaPlaylistIE, WistiaChannelIE, ) +from .wordpress import WordpressPlaylistEmbedIE from .worldstarhiphop import WorldStarHipHopIE from .wppilot import ( WPPilotIE, diff --git a/yt_dlp/extractor/wordpress.py b/yt_dlp/extractor/wordpress.py new file mode 100644 index 0000000000..e90ae6c1e1 --- /dev/null +++ b/yt_dlp/extractor/wordpress.py @@ -0,0 +1,69 @@ +from .common import InfoExtractor +from ..utils import ( + get_elements_by_class, + int_or_none, + parse_duration, + traverse_obj, +) + + +# https://codex.wordpress.org/Playlist_Shortcode +class WordpressPlaylistEmbedIE(InfoExtractor): + _VALID_URL = False + IE_NAME = 'wordpress:playlist' + _WEBPAGE_TESTS = [{ + # 5 WordPress playlists. This is using wpse-playlist, which is similar. + # See: https://github.com/birgire/wpse-playlist + 'url': 'https://xlino.com/wordpress-playlist-shortcode-with-external-audio-or-video-files/', + 'info_dict': { + 'id': 'wordpress-playlist-shortcode-with-external-audio-or-video-files', + 'title': 'WordPress: Playlist shortcode with external audio or video files – Birgir Erlendsson (birgire)', + 'age_limit': 0, + }, + 'playlist_count': 5, + }, { + 'url': 'https://pianoadventures.com/products/piano-adventures-level-1-lesson-book-enhanced-cd/', + 'info_dict': { + 'id': 'piano-adventures-level-1-lesson-book-enhanced-cd-wp-playlist-1', + 'title': 'Wordpress Playlist', + 'thumbnail': 'https://pianoadventures.com/wp-content/uploads/sites/13/2022/01/CD1002cover.jpg', + 'age_limit': 0, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'CD1002-21', + 'ext': 'mp3', + 'title': '21 Half-Time Show', + 'thumbnail': 'https://pianoadventures.com/wp-content/plugins/media-library-assistant/images/crystal/audio.png', + 'album': 'Piano Adventures Level 1 Lesson Book (2nd Edition)', + 'genre': 'Classical', + 'duration': 49.0, + 'artist': 'Nancy and Randall Faber', + 'description': 'md5:a9f8e9aeabbd2912bc13cc0fab1a4ce8', + } + }], + 'playlist_count': 6, + 'params': {'skip_download': True} + }] + + def _extract_from_webpage(self, url, webpage): + # class should always be "wp-playlist-script" + # See: https://core.trac.wordpress.org/browser/trunk/src/wp-includes/media.php#L2930 + for i, j in enumerate(get_elements_by_class('wp-playlist-script', webpage)): + playlist_json = self._parse_json(j, self._generic_id(url), fatal=False, ignore_extra=True, errnote='') or {} + if not playlist_json: + continue + entries = [{ + 'id': self._generic_id(track['src']), + 'title': track.get('title'), + 'url': track.get('src'), + 'thumbnail': traverse_obj(track, ('thumb', 'src')), + 'album': traverse_obj(track, ('meta', 'album')), + 
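                # traverse_obj walks nested structures safely, yielding None
                # instead of raising on a missing key -- a minimal sketch of
                # the calls used in this dict:
                #   traverse_obj({'meta': {'album': 'X'}}, ('meta', 'album'))  # -> 'X'
                #   traverse_obj({}, ('meta', 'album'))                        # -> None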
'artist': traverse_obj(track, ('meta', 'artist')), + 'genre': traverse_obj(track, ('meta', 'genre')), + 'duration': parse_duration(traverse_obj(track, ('meta', 'length_formatted'))), + 'description': track.get('description'), + 'height': int_or_none(traverse_obj(track, ('dimensions', 'original', 'height'))), + 'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))), + } for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)] + yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist') From 10e2eb4f81d3c9ef14d59a775428bbef96f22709 Mon Sep 17 00:00:00 2001 From: Julien Hadley Jack <github@jlhj.de> Date: Wed, 28 Sep 2022 11:04:03 +0200 Subject: [PATCH 193/284] [extractor/ondemandkorea] Update `jw_config` regex (#5040) Authored by: julien-hadleyjack --- yt_dlp/extractor/ondemandkorea.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index 84687ef473..dd7d1d7dea 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -11,11 +11,11 @@ class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' _GEO_COUNTRIES = ['US', 'CA'] _TESTS = [{ - 'url': 'https://www.ondemandkorea.com/ask-us-anything-e43.html', + 'url': 'https://www.ondemandkorea.com/ask-us-anything-e351.html', 'info_dict': { - 'id': 'ask-us-anything-e43', + 'id': 'ask-us-anything-e351', 'ext': 'mp4', - 'title': 'Ask Us Anything : Gain, Ji Soo - 09/24/2016', + 'title': 'Ask Us Anything : Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won - 09/24/2022', 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”', 'thumbnail': r're:^https?://.*\.jpg$', }, @@ -23,13 +23,13 @@ class OnDemandKoreaIE(InfoExtractor): 'skip_download': 'm3u8 download' } }, { - 'url': 'https://www.ondemandkorea.com/confession-e01-1.html', + 'url': 'https://www.ondemandkorea.com/work-later-drink-now-e1.html', 'info_dict': { - 'id': 'confession-e01-1', + 'id': 'work-later-drink-now-e1', 'ext': 'mp4', - 'title': 'Confession : E01', - 'description': 'Choi Do-hyun, a criminal attorney, is the son of a death row convict. Ever since Choi Pil-su got arrested for murder, Do-hyun has wanted to solve his ', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Work Later, Drink Now : E01', + 'description': 'Work Later, Drink First follows three women who find solace in a glass of liquor at the end of the day. 
So-hee, who gets comfort from a cup of soju af', + 'thumbnail': r're:^https?://.*\.png$', 'subtitles': { 'English': 'mincount:1', }, @@ -69,9 +69,11 @@ def _real_extract(self, url): webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) jw_config = self._parse_json( - self._search_regex( + self._search_regex(( + r'(?P<options>{\s*[\'"]tracks[\'"].*?})[)\];]+$', r'playlist\s*=\s*\[(?P<options>.+)];?$', - webpage, 'jw config', flags=re.MULTILINE, group='options'), + r'odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', + ), webpage, 'jw config', flags=re.MULTILINE | re.DOTALL, group='options'), video_id, transform_source=js_to_json) info = self._parse_jwplayer_data( jw_config, video_id, require_title=False, m3u8_id='hls', From a5642f2c4a212488ef4d103ae54ed01f6040adf2 Mon Sep 17 00:00:00 2001 From: Anant Murmu <freezboltz@gmail.com> Date: Thu, 29 Sep 2022 08:31:43 +0530 Subject: [PATCH 194/284] [extractor/zee5] Generate device ids (#5062) Closes #4937 Authored by: freezboltz --- yt_dlp/extractor/zee5.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index 29c6d04e69..d0229e78b5 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -1,4 +1,6 @@ import json +import random +import string from .common import InfoExtractor from ..compat import compat_str @@ -84,7 +86,7 @@ class Zee5IE(InfoExtractor): 'only_matching': True }] _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' - _DEVICE_ID = '1q70TH8Wz0wTyw4buVgg000000000000' + _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') _USER_TOKEN = None _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.' _NETRC_MACHINE = 'zee5' From f1aae715684b8a2cd4ce5590373b49ba5030dba6 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Fri, 30 Sep 2022 14:02:20 +1300 Subject: [PATCH 195/284] [extractor/rcs] Fix embed extraction Fixes https://github.com/yt-dlp/yt-dlp/issues/5076 Authored by: coletdjnz --- yt_dlp/extractor/rcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py index 28ba42eedc..e6185fec75 100644 --- a/yt_dlp/extractor/rcs.py +++ b/yt_dlp/extractor/rcs.py @@ -337,7 +337,7 @@ def _sanitize_urls(urls): @classmethod def _extract_embed_urls(cls, url, webpage): - return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) + return cls._sanitize_urls(list(super()._extract_embed_urls(url, webpage))) class RCSIE(RCSBaseIE): From dfea94f8f69a8cd06b4781e95a0cd23fb06e6d67 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Fri, 30 Sep 2022 03:05:44 +0200 Subject: [PATCH 196/284] [extractor/crunchyroll:beta] Improve handling of hardsubs (#5041) Closes #3397 Authored by: Grub4K --- README.md | 2 +- yt_dlp/YoutubeDL.py | 4 +- yt_dlp/extractor/crunchyroll.py | 73 +++++++++++++++++++++++++-------- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 07ed04061c..76c73398e3 100644 --- a/README.md +++ b/README.md @@ -1722,7 +1722,7 @@ #### crunchyroll #### crunchyrollbeta * `format`: Which stream type(s) to extract (default: `adaptive_hls`). 
Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2` -* `hardsub`: Preference order for which hardsub versions to extract (default: `None` = no hardsubs), e.g. `crunchyrollbeta:hardsub=en-US,None` +* `hardsub`: Preference order for which hardsub versions to extract, or `all` (default: `None` = no hardsubs), e.g. `crunchyrollbeta:hardsub=en-US,None` #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 7b0616cba8..4fcf1f5cc7 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -846,7 +846,7 @@ def to_stdout(self, message, skip_eol=False, quiet=None): 'Use "YoutubeDL.to_screen" instead') self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) - def to_screen(self, message, skip_eol=False, quiet=None): + def to_screen(self, message, skip_eol=False, quiet=None, only_once=False): """Print message to screen if not in quiet mode""" if self.params.get('logger'): self.params['logger'].debug(message) @@ -855,7 +855,7 @@ def to_screen(self, message, skip_eol=False, quiet=None): return self._write_string( '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files.screen) + self._out_files.screen, only_once=only_once) def to_stderr(self, message, only_once=False): """Print message to stderr""" diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 141d8c5a7c..4f209e6705 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -779,7 +779,28 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): 'episode_number': 73, 'thumbnail': r're:^https://beta.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYE5WKQGR', + 'info_dict': { + 'id': 'GYE5WKQGR', + 'ext': 'mp4', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://beta.crunchyroll.com/imgsrv/.*\.jpeg$', + }, + 'params': {'skip_download': True}, + 'skip': 'Video is Premium only', }, { 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y', 'only_matching': True, @@ -807,30 +828,48 @@ def _real_extract(self, url): hardsub_preference = qualities(requested_hardsubs[::-1]) requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - formats = [] + available_formats = {} for stream_type, streams in get_streams('streams'): if stream_type not in requested_formats: continue for stream in streams.values(): - hardsub_lang = stream.get('hardsub_locale') or '' - if hardsub_lang.lower() not in requested_hardsubs: - continue - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) if not stream.get('url'): continue - if stream_type.endswith('hls'): + hardsub_lang = stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + 
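                # Each hardsub language keeps one representative stream here;
                # which languages then get full manifest extraction is decided
                # below via full_format_langs, controlled by the README option
                # documented above, e.g.
                #   --extractor-args "crunchyrollbeta:hardsub=en-US,None"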
available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' + 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: adaptive_formats = self._extract_m3u8_formats( - stream['url'], display_id, 'mp4', m3u8_id=format_id, + stream_url, display_id, 'mp4', m3u8_id=format_id, fatal=False, note=f'Downloading {format_id} HLS manifest') - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream['url'], display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') + else: + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) self._sort_formats(formats) return { From 11398b922c0469e4143f72951d3c9c55587ef39d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 30 Sep 2022 15:43:40 +0000 Subject: [PATCH 197/284] [extractor/nbc] Add NBCStations extractor (#5077) Closes #4571 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/nbc.py | 172 ++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fa33866df0..76cba4ba22 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1079,6 +1079,7 @@ NBCSportsIE, NBCSportsStreamIE, NBCSportsVPlayerIE, + NBCStationsIE, ) from .ndr import ( NDRIE, diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 910cbedf67..6b482620a7 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -7,14 +7,20 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_duration, RegexNotFoundError, smuggle_url, + str_or_none, + traverse_obj, try_get, + unified_strdate, unified_timestamp, update_url_query, + url_basename, + variadic, ) @@ -584,3 +590,169 @@ def _real_extract(self, url): 'formats': formats, 'is_live': is_live, } + + +class NBCStationsIE(InfoExtractor): + _DOMAIN_RE = '|'.join(map(re.escape, ( + 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', + 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', + 'necn', 
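        # The escaped names collapse into a single alternation, e.g.
        # '|'.join(map(re.escape, ('necn', 'telemundo52'))) == 'necn|telemundo52',
        # which _VALID_URL below interpolates as the (?P<site>...) group.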
'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?P<site>{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P<id>[^/?#]+)/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', + 'md5': '462041d91bd762ef5a38b7d85d6dc18f', + 'info_dict': { + 'id': '2968618', + 'ext': 'mp4', + 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', + 'description': None, + 'timestamp': 1661135892, + 'upload_date': '20220821', + 'uploader': 'NBC 4', + 'uploader_id': 'KNBC', + 'channel': 'nbclosangeles', + }, + }, { + 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', + 'md5': '0917dcf7885be1023a9220630d415f67', + 'info_dict': { + 'id': '2247002', + 'ext': 'mp4', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'timestamp': 1660886507, + 'upload_date': '20220819', + 'uploader': 'Telemundo Arizona', + 'uploader_id': 'KTAZ', + 'channel': 'telemundoarizona', + }, + }] + + _RESOLUTIONS = { + '1080': '1920', + '720': '1280', + '540': '960', + '360': '640', + '234': '416', + } + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, video_id) + + nbc_data = self._search_json( + r'<script>var\s*nbc\s*=\s*', webpage, 'NBC JSON data', video_id) + pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' + fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) + fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') + + video_data = self._parse_json(self._html_search_regex( + r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) + video_data = variadic(video_data)[0] + video_data.update(self._parse_json(self._html_search_regex( + r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) + + formats = [] + + if video_data.get('mpx_is_livestream') == '1': + live = True + player_id = traverse_obj( + video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', + ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') + query = { + 'mbr': 'true', + 'assetTypes': 'LegacyRelease', + 'fwsitesection': fw_ssid, + 'fwNetworkID': fw_network_id, + 'pprofile': 'ots_desktop_html', + 'sensitive': 'false', + 'w': '1920', + 'h': '1080', + 'rnd': '1660303', + 'mode': 'LIVE', + 'format': 'SMIL', + 'tracking': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + 'vpaid': 'script', + 'schema': '2.0', + 'SDK': 'PDK+6.1.3', + } + info = { + 'title': f'{channel} livestream', + } + + else: + live = False + player_id = traverse_obj( + video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', + ('video', 'meta', 'mpx_pid'), 'mpx_pid') + + date_string = traverse_obj(video_data, 'date_string', 'date_gmt') + if date_string: + date_string = self._search_regex( + r'datetime="([^"]+)"', date_string, 'date string', fatal=False) + else: + date_string = traverse_obj( + nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), + ('dataLayer', 'adobe', 'eVar59')) + + video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') + if video_url: + height = url_basename(video_url).split('-')[1].split('p')[0] + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'width': int_or_none(self._RESOLUTIONS.get(height)), + 'height': 
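                    # height is parsed from the file name, assuming a shape like
                    # 'clip-720p.mp4': url_basename(...).split('-')[1].split('p')[0] -> '720';
                    # _RESOLUTIONS above then maps that height back to a standard width.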
int_or_none(height), + 'format_id': f'http-{height}', + }) + + query = { + 'mbr': 'true', + 'assetTypes': 'LegacyRelease', + 'fwsitesection': fw_ssid, + 'fwNetworkID': fw_network_id, + 'format': 'redirect', + 'manifest': 'm3u', + 'Tracking': 'true', + 'Embedded': 'true', + 'formats': 'MPEG4', + } + info = { + 'title': video_data.get('title') or traverse_obj( + nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), + ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), + 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), + 'upload_date': str_or_none(unified_strdate(date_string)), + 'timestamp': int_or_none(unified_timestamp(date_string)), + } + + if not player_id: + raise ExtractorError( + 'No video player ID or livestream player ID found in webpage', expected=True) + + headers = {'Origin': f'https://www.{channel}.com'} + manifest, urlh = self._download_webpage_handle( + f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, + headers=headers, query=query, note='Downloading manifest') + if live: + manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') + else: + manifest_url = urlh.geturl() + + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', + fatal=live, live=live, errnote='No HLS formats found')) + self._sort_formats(formats) + + return { + 'id': str_or_none(video_id), + 'channel': channel, + 'uploader': str_or_none(nbc_data.get('on_air_name')), + 'uploader_id': str_or_none(nbc_data.get('callLetters')), + 'formats': formats, + 'is_live': live, + **info, + } From 8dbad2a4394ed68a2d6d48f6b4b2f7176a30906c Mon Sep 17 00:00:00 2001 From: Teemu Ikonen <tpikonen@gmail.com> Date: Fri, 30 Sep 2022 19:14:14 +0300 Subject: [PATCH 198/284] [extractor/audioboom] Support direct URLs and refactor (#4803) Authored by: tpikonen, pukkandan --- yt_dlp/extractor/audioboom.py | 70 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/yt_dlp/extractor/audioboom.py b/yt_dlp/extractor/audioboom.py index dc19a3874b..f1aa0201b6 100644 --- a/yt_dlp/extractor/audioboom.py +++ b/yt_dlp/extractor/audioboom.py @@ -2,6 +2,8 @@ from ..utils import ( clean_html, float_or_none, + unescapeHTML, + traverse_obj, ) @@ -9,16 +11,28 @@ class AudioBoomIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', - 'md5': '7b00192e593ff227e6a315486979a42d', + 'md5': '4d68be11c9f9daf3dab0778ad1e010c3', 'info_dict': { 'id': '7398103', 'ext': 'mp3', 'title': 'Asim Chaudhry', - 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'description': 'md5:0ed714ae0e81e5d9119cac2f618ad679', 'duration': 4000.99, 'uploader': 'Sue Perkins: An hour or so with...', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', } + }, { # Direct mp3-file link + 'url': 'https://audioboom.com/posts/8128496.mp3', + 'md5': 'e329edf304d450def95c7f86a9165ee1', + 'info_dict': { + 'id': '8128496', + 'ext': 'mp3', + 'title': 'TCRNo8 / DAILY 03 - In Control', + 'description': 'md5:44665f142db74858dfa21c5b34787948', + 'duration': 1689.7, + 'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904', + } }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', 'only_matching': True, @@ -26,45 +40,23 @@ class AudioBoomIE(InfoExtractor): def 
_real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(f'https://audioboom.com/posts/{video_id}', video_id) - webpage = self._download_webpage(url, video_id) - - clip = None - - clip_store = self._parse_json( - self._html_search_regex( - r'data-new-clip-store=(["\'])(?P<json>{.+?})\1', - webpage, 'clip store', default='{}', group='json'), - video_id, fatal=False) - if clip_store: - clips = clip_store.get('clips') - if clips and isinstance(clips, list) and isinstance(clips[0], dict): - clip = clips[0] - - def from_clip(field): - if clip: - return clip.get(field) - - audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( - 'audio', webpage, 'audio url') - title = from_clip('title') or self._html_search_meta( - ['og:title', 'og:audio:title', 'audio_title'], webpage) - description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) - - duration = float_or_none(from_clip('duration') or self._html_search_meta( - 'weibo:audio:duration', webpage)) - - uploader = from_clip('author') or self._html_search_meta( - ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') - uploader_url = from_clip('author_url') or self._html_search_meta( - 'audioboo:channel', webpage, 'uploader url') + clip_store = self._search_json( + r'data-react-class="V5DetailPagePlayer"\s*data-react-props=["\']', + webpage, 'clip store', video_id, fatal=False, transform_source=unescapeHTML) + clip = traverse_obj(clip_store, ('clips', 0), expected_type=dict) or {} return { 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_url': uploader_url, + 'url': clip.get('clipURLPriorToLoading') or self._og_search_property('audio', webpage, 'audio url'), + 'title': clip.get('title') or self._html_search_meta(['og:title', 'og:audio:title', 'audio_title'], webpage), + 'description': (clip.get('description') or clean_html(clip.get('formattedDescription')) + or self._og_search_description(webpage)), + 'duration': float_or_none(clip.get('duration') or self._html_search_meta('weibo:audio:duration', webpage)), + 'uploader': clip.get('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader'), + 'uploader_url': clip.get('author_url') or self._html_search_regex( + r'<div class="avatar flex-shrink-0">\s*<a href="(?P<uploader_url>http[^"]+)"', + webpage, 'uploader url', fatal=False), } From 48f535f5f8de109cdfb20eef8beed73e65cdfdd4 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Fri, 30 Sep 2022 11:21:31 -0500 Subject: [PATCH 199/284] [extractor/tencent] Add Iflix extractor (#4829) Closes #4823 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/tencent.py | 137 +++++++++++++++++++++++++------- 2 files changed, 110 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 76cba4ba22..d8fe74413d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1766,6 +1766,8 @@ from .telewebion import TelewebionIE from .tempo import TempoIE from .tencent import ( + IflixEpisodeIE, + IflixSeriesIE, VQQSeriesIE, VQQVideoIE, WeTvEpisodeIE, diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py index c755407d3c..44cd196009 100644 --- a/yt_dlp/extractor/tencent.py +++ b/yt_dlp/extractor/tencent.py 
@@ -262,6 +262,41 @@ def _get_webpage_metadata(self, webpage, video_id): traverse_obj(self._search_nextjs_data(webpage, video_id), ('props', 'pageProps', 'data')), video_id, fatal=False) + def _extract_episode(self, url): + video_id, series_id = self._match_valid_url(url).group('id', 'series_id') + webpage = self._download_webpage(url, video_id) + webpage_metadata = self._get_webpage_metadata(webpage, video_id) + + formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) + return { + 'id': video_id, + 'title': self._get_clean_title(self._og_search_title(webpage) + or traverse_obj(webpage_metadata, ('coverInfo', 'title'))), + 'description': (traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'duration'))), + 'series': traverse_obj(webpage_metadata, ('coverInfo', 'title')), + 'episode_number': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'episode'))), + } + + def _extract_series(self, url, ie): + series_id = self._match_id(url) + webpage = self._download_webpage(url, series_id) + webpage_metadata = self._get_webpage_metadata(webpage, series_id) + + episode_paths = ([f'/play/{series_id}/{episode["vid"]}' for episode in webpage_metadata.get('videoList')] + or re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage)) + + return self.playlist_from_matches( + episode_paths, series_id, ie=ie, getter=functools.partial(urljoin, url), + title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) + or self._og_search_title(webpage)), + description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage))) + class WeTvEpisodeIE(WeTvBaseIE): IE_NAME = 'wetv:episode' @@ -312,24 +347,7 @@ class WeTvEpisodeIE(WeTvBaseIE): }] def _real_extract(self, url): - video_id, series_id = self._match_valid_url(url).group('id', 'series_id') - webpage = self._download_webpage(url, video_id) - webpage_metadata = self._get_webpage_metadata(webpage, video_id) - - formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) - return { - 'id': video_id, - 'title': self._get_clean_title(self._og_search_title(webpage) - or traverse_obj(webpage_metadata, ('coverInfo', 'title'))), - 'description': (traverse_obj(webpage_metadata, ('coverInfo', 'description')) - or self._og_search_description(webpage)), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'duration'))), - 'series': traverse_obj(webpage_metadata, ('coverInfo', 'title')), - 'episode_number': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'episode'))), - } + return self._extract_episode(url) class WeTvSeriesIE(WeTvBaseIE): @@ -354,16 +372,77 @@ class WeTvSeriesIE(WeTvBaseIE): }] def _real_extract(self, url): - series_id = self._match_id(url) - webpage = self._download_webpage(url, series_id) - webpage_metadata = self._get_webpage_metadata(webpage, series_id) + return self._extract_series(url, WeTvEpisodeIE) - episode_paths = ([f'/play/{series_id}/{episode["vid"]}' for episode in webpage_metadata.get('videoList')] - or re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage)) - return self.playlist_from_matches( - 
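            # The getter resolves each relative episode path against the page
            # URL -- e.g., with hypothetical IDs:
            #   urljoin('https://wetv.vip/en/play/abc123', '/play/abc123/xyz789')
            #   == 'https://wetv.vip/play/abc123/xyz789'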
episode_paths, series_id, ie=WeTvEpisodeIE, getter=functools.partial(urljoin, url), - title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) - or self._og_search_title(webpage)), - description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) - or self._og_search_description(webpage))) +class IflixBaseIE(WeTvBaseIE): + _VALID_URL_BASE = r'https?://(?:www\.)?iflix\.com/(?:[^?#]+/)?play' + + _API_URL = 'https://vplay.iflix.com/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '330201' + _HOST = 'www.iflix.com' + _REFERER = 'www.iflix.com' + + +class IflixEpisodeIE(IflixBaseIE): + IE_NAME = 'iflix:episode' + _VALID_URL = IflixBaseIE._VALID_URL_BASE + r'/(?P<series_id>\w+)(?:-[^?#]+)?/(?P<id>\w+)(?:-[^?#]+)?' + + _TESTS = [{ + 'url': 'https://www.iflix.com/en/play/daijrxu03yypu0s/a0040kvgaza', + 'md5': '9740f9338c3a2105290d16b68fb3262f', + 'info_dict': { + 'id': 'a0040kvgaza', + 'ext': 'mp4', + 'title': 'EP1: Put Your Head On My Shoulder 2021', + 'description': 'md5:c095a742d3b7da6dfedd0c8170727a42', + 'thumbnail': r're:^https?://[^?#]+daijrxu03yypu0s', + 'series': 'Put Your Head On My Shoulder 2021', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2639, + }, + }, { + 'url': 'https://www.iflix.com/en/play/fvvrcc3ra9lbtt1-Take-My-Brother-Away/i0029sd3gm1-EP1%EF%BC%9ATake-My-Brother-Away', + 'md5': '375c9b8478fdedca062274b2c2f53681', + 'info_dict': { + 'id': 'i0029sd3gm1', + 'ext': 'mp4', + 'title': 'EP1:Take My Brother Away', + 'description': 'md5:f0f7be1606af51cd94d5627de96b0c76', + 'thumbnail': r're:^https?://[^?#]+fvvrcc3ra9lbtt1', + 'series': 'Take My Brother Away', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 228, + }, + }] + + def _real_extract(self, url): + return self._extract_episode(url) + + +class IflixSeriesIE(IflixBaseIE): + _VALID_URL = IflixBaseIE._VALID_URL_BASE + r'/(?P<id>\w+)(?:-[^/?#]+)?/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://www.iflix.com/en/play/g21a6qk4u1s9x22-You-Are-My-Hero', + 'info_dict': { + 'id': 'g21a6qk4u1s9x22', + 'title': 'You Are My Hero', + 'description': 'md5:9c4d844bc0799cd3d2b5aed758a2050a', + }, + 'playlist_count': 40, + }, { + 'url': 'https://www.iflix.com/play/0s682hc45t0ohll', + 'info_dict': { + 'id': '0s682hc45t0ohll', + 'title': 'Miss Gu Who Is Silent', + 'description': 'md5:a9651d0236f25af06435e845fa2f8c78', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + return self._extract_series(url, IflixEpisodeIE) From 9cc5aed990e6f3baa1eff3d7e040eef197a166de Mon Sep 17 00:00:00 2001 From: Mehavoid <63477090+Mehavoid@users.noreply.github.com> Date: Fri, 30 Sep 2022 19:39:08 +0300 Subject: [PATCH 200/284] [extractor/trovo] Fix extractors (#4880) Authored by: Mehavoid Closes #4878 --- yt_dlp/extractor/trovo.py | 133 +++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 75 deletions(-) diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py index c8816f7bc2..f4d4bcd174 100644 --- a/yt_dlp/extractor/trovo.py +++ b/yt_dlp/extractor/trovo.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, format_field, + traverse_obj, int_or_none, str_or_none, try_get, @@ -26,7 +27,7 @@ def _call_api(self, video_id, data): resp = self._download_json( url, video_id, data=json.dumps([data]).encode(), headers={'Accept': 'application/json'}, query={ - 'qid': ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)), + 'qid': ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)), })[0] if 'errors' in resp: raise 
ExtractorError(f'Trovo said: {resp["errors"][0]["message"]}') @@ -146,7 +147,26 @@ class TrovoVodIE(TrovoBaseIE): 'upload_date': '20220611', 'comment_count': int, 'categories': ['Minecraft'], - } + }, + 'skip': 'Not available', + }, { + 'url': 'https://trovo.live/s/Trovo/549756886599?vid=ltv-100264059_100264059_387702304241698583', + 'info_dict': { + 'id': 'ltv-100264059_100264059_387702304241698583', + 'ext': 'mp4', + 'timestamp': 1661479563, + 'thumbnail': 'http://vod.trovo.live/be5ae591vodtransusw1301120758/cccb9915387702304241698583/coverBySnapshot/coverBySnapshot_10_0.jpg', + 'uploader_id': '100264059', + 'uploader': 'Trovo', + 'title': 'Dev Corner 8/25', + 'uploader_url': 'https://trovo.live/Trovo', + 'duration': 3753, + 'view_count': int, + 'like_count': int, + 'upload_date': '20220826', + 'comment_count': int, + 'categories': ['Talk Shows'], + }, }, { 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', 'only_matching': True, @@ -162,22 +182,20 @@ def _real_extract(self, url): # however that seems unreliable - sometimes it randomly doesn't return the data, # at least when using a non-residential IP. resp = self._call_api(vid, data={ - 'operationName': 'batchGetVodDetailInfo', + 'operationName': 'vod_VodReaderService_BatchGetVodDetailInfo', 'variables': { 'params': { 'vids': [vid], }, }, - 'extensions': { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': 'ceae0355d66476e21a1dd8e8af9f68de95b4019da2cda8b177c9a2255dad31d0', - }, - }, + 'extensions': {}, }) - vod_detail_info = resp['VodDetailInfos'][vid] - vod_info = vod_detail_info['vodInfo'] - title = vod_info['title'] + + vod_detail_info = traverse_obj(resp, ('VodDetailInfos', vid), expected_type=dict) + if not vod_detail_info: + raise ExtractorError('This video not found or not available anymore', expected=True) + vod_info = vod_detail_info.get('vodInfo') + title = vod_info.get('title') if try_get(vod_info, lambda x: x['playbackRights']['playbackRights'] != 'Normal'): playback_rights_setting = vod_info['playbackRights']['playbackRightsSetting'] @@ -228,7 +246,7 @@ def _real_extract(self, url): def _get_comments(self, vid): for page in itertools.count(1): comments_json = self._call_api(vid, data={ - 'operationName': 'getCommentList', + 'operationName': 'public_CommentProxyService_GetCommentList', 'variables': { 'params': { 'appInfo': { @@ -240,10 +258,7 @@ def _get_comments(self, vid): }, }, 'extensions': { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': 'be8e5f9522ddac7f7c604c0d284fd22481813263580849926c4c66fb767eed25', - }, + 'singleReq': 'true', }, }) for comment in comments_json['commentList']: @@ -266,33 +281,37 @@ def _get_comments(self, vid): class TrovoChannelBaseIE(TrovoBaseIE): - def _get_vod_json(self, page, uid): - raise NotImplementedError('This method must be implemented by subclasses') - - def _entries(self, uid): + def _entries(self, spacename): for page in itertools.count(1): - vod_json = self._get_vod_json(page, uid) + vod_json = self._call_api(spacename, data={ + 'operationName': self._OPERATION, + 'variables': { + 'params': { + 'terminalSpaceID': { + 'spaceName': spacename, + }, + 'currPage': page, + 'pageSize': 99, + }, + }, + 'extensions': { + 'singleReq': 'true', + }, + }) vods = vod_json.get('vodInfos', []) for vod in vods: + vid = vod.get('vid') + room = traverse_obj(vod, ('spaceInfo', 'roomID')) yield self.url_result( - 'https://trovo.live/%s/%s' % (self._TYPE, vod.get('vid')), + f'https://trovo.live/s/{spacename}/{room}?vid={vid}', ie=TrovoVodIE.ie_key()) - has_more = 
vod_json['hasMore'] + has_more = vod_json.get('hasMore') if not has_more: break def _real_extract(self, url): - id = self._match_id(url) - live_info = self._call_api(id, data={ - 'operationName': 'live_LiveReaderService_GetLiveInfo', - 'variables': { - 'params': { - 'userName': id, - }, - }, - }) - uid = str(live_info['streamerInfo']['uid']) - return self.playlist_result(self._entries(uid), playlist_id=uid) + spacename = self._match_id(url) + return self.playlist_result(self._entries(spacename), playlist_id=spacename) class TrovoChannelVodIE(TrovoChannelBaseIE): @@ -303,29 +322,11 @@ class TrovoChannelVodIE(TrovoChannelBaseIE): 'url': 'trovovod:OneTappedYou', 'playlist_mincount': 24, 'info_dict': { - 'id': '100719456', + 'id': 'OneTappedYou', }, }] - _TYPE = 'video' - - def _get_vod_json(self, page, uid): - return self._call_api(uid, data={ - 'operationName': 'getChannelLtvVideoInfos', - 'variables': { - 'params': { - 'channelID': int(uid), - 'pageSize': 99, - 'currPage': page, - }, - }, - 'extensions': { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': '78fe32792005eab7e922cafcdad9c56bed8bbc5f5df3c7cd24fcb84a744f5f78', - }, - }, - }) + _OPERATION = 'vod_VodReaderService_GetChannelLtvVideoInfos' class TrovoChannelClipIE(TrovoChannelBaseIE): @@ -336,26 +337,8 @@ class TrovoChannelClipIE(TrovoChannelBaseIE): 'url': 'trovoclip:OneTappedYou', 'playlist_mincount': 29, 'info_dict': { - 'id': '100719456', + 'id': 'OneTappedYou', }, }] - _TYPE = 'clip' - - def _get_vod_json(self, page, uid): - return self._call_api(uid, data={ - 'operationName': 'getChannelClipVideoInfos', - 'variables': { - 'params': { - 'channelID': int(uid), - 'pageSize': 99, - 'currPage': page, - }, - }, - 'extensions': { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': 'e7924bfe20059b5c75fc8ff9e7929f43635681a7bdf3befa01072ed22c8eff31', - }, - }, - }) + _OPERATION = 'vod_VodReaderService_GetChannelClipVideoInfos' From 7e378287c4502d82aedb5272b8e9d5f6c1681fad Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Sat, 1 Oct 2022 01:40:33 +0900 Subject: [PATCH 201/284] [extractor/malltv] Fix video_id extraction (#4883) Closes #4870 Authored by: HobbyistDev --- yt_dlp/extractor/malltv.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/malltv.py b/yt_dlp/extractor/malltv.py index bfd6008b3a..02f226be57 100644 --- a/yt_dlp/extractor/malltv.py +++ b/yt_dlp/extractor/malltv.py @@ -14,7 +14,7 @@ class MallTVIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', - 'md5': '1c4a37f080e1f3023103a7b43458e518', + 'md5': 'cd69ce29176f6533b65bff69ed9a5f2a', 'info_dict': { 'id': 't0zzt0', 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', @@ -25,6 +25,11 @@ class MallTVIE(InfoExtractor): 'timestamp': 1538870400, 'upload_date': '20181007', 'view_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnigfq/thumbnails/retina.jpg', + 'average_rating': 9.060869565217391, + 'dislike_count': int, + 'like_count': int, } }, { 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', @@ -32,6 +37,24 @@ class MallTVIE(InfoExtractor): }, { 'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka', 
'only_matching': True, + }, { + 'url': 'https://www.mall.tv/zivoty-slavnych/nadeje-vychodu-i-zapadu-jak-michail-gorbacov-zmenil-politickou-mapu-sveta-a-ziskal-za-to-nobelovu-cenu-miru', + 'info_dict': { + 'id': 'yx010y', + 'ext': 'mp4', + 'dislike_count': int, + 'description': 'md5:aee02bee5a8d072c6a8207b91d1905a9', + 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnjdeu/thumbnails/retina.jpg', + 'comment_count': int, + 'display_id': 'md5:0ec2afa94d2e2b7091c019cef2a43a9b', + 'like_count': int, + 'duration': 752, + 'timestamp': 1646956800, + 'title': 'md5:fe79385daaf16d74c12c1ec4a26687af', + 'view_count': int, + 'upload_date': '20220311', + 'average_rating': 9.685714285714285, + } }] def _real_extract(self, url): @@ -43,12 +66,12 @@ def _real_extract(self, url): video = self._parse_json(self._search_regex( r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', webpage, 'video object'), display_id) - video_source = video['VideoSource'] + video_id = self._search_regex( - r'/([\da-z]+)/index\b', video_source, 'video id') + r'<input\s*id\s*=\s*player-id-name\s*[^>]+value\s*=\s*(\w+)', webpage, 'video id') formats = self._extract_m3u8_formats( - video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + video['VideoSource'], video_id, 'mp4', 'm3u8_native') self._sort_formats(formats) subtitles = {} @@ -69,7 +92,7 @@ def get_count(k): info = self._search_json_ld(webpage, video_id, default={}) return merge_dicts({ - 'id': video_id, + 'id': str(video_id), 'display_id': display_id, 'title': video.get('Title'), 'description': clean_html(video.get('Description')), From 2e0f8d4f6e4dd546044c9432ec6aa223f67178bb Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Fri, 30 Sep 2022 18:52:52 +0200 Subject: [PATCH 202/284] [extractor/yandexvideopreview] Update _VALID_URL (#5084) Closes #5065 Authored by: Grub4K --- yt_dlp/extractor/yandexvideo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index 37ff514b3a..eadb1aaeeb 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -147,7 +147,7 @@ def _real_extract(self, url): class YandexVideoPreviewIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?yandex\.ru/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?yandex\.\w{2,3}(?:\.(?:am|ge|il|tr))?/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)' _TESTS = [{ # Odnoklassniki 'url': 'https://yandex.ru/video/preview/?filmId=10682852472978372885&text=summer', 'info_dict': { @@ -174,6 +174,9 @@ class YandexVideoPreviewIE(InfoExtractor): }, { # Odnoklassniki 'url': 'https://yandex.ru/video/preview/?text=Francis%20Lai%20-%20Le%20Bon%20Et%20Les%20MC)chants&path=wizard&parent-reqid=1643208087979310-1481782809207673478-sas3-0931-2f9-sas-l7-balancer-8080-BAL-9380&wiz_type=vital&filmId=12508152936505397283', 'only_matching': True, + }, { # Odnoklassniki + 'url': 'https://yandex.com/video/preview/?text=dossier%2051%20film%201978&path=yandex_search&parent-reqid=1664361087754492-8727541069609384458-sas2-0340-sas-l7-balancer-8080-BAL-8045&noreask=1&from_type=vast&filmId=5794987234584444632', + 'only_matching': True, }] def _real_extract(self, url): From 20a7304e4c7a839ab73be03a248d092173206c17 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Sat, 1 Oct 2022 01:54:05 +0900 Subject: [PATCH 203/284] [extractor/unscripted] Add extractor (#5008) Closes #4903 Authored by: 
HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/unscripted.py | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 yt_dlp/extractor/unscripted.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d8fe74413d..4d94d35633 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1971,6 +1971,7 @@ from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE +from .unscripted import UnscriptedNewsVideoIE from .uol import UOLIE from .uplynk import ( UplynkIE, diff --git a/yt_dlp/extractor/unscripted.py b/yt_dlp/extractor/unscripted.py new file mode 100644 index 0000000000..6643a71b10 --- /dev/null +++ b/yt_dlp/extractor/unscripted.py @@ -0,0 +1,53 @@ +from .common import InfoExtractor +from ..utils import parse_duration, traverse_obj + + +class UnscriptedNewsVideoIE(InfoExtractor): + _VALID_URL = r'https?://www\.unscripted\.news/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.unscripted.news/videos/a-day-at-the-farmers-protest', + 'info_dict': { + 'id': '60c0a55cd1e99b1079918a57', + 'display_id': 'a-day-at-the-farmers-protest', + 'ext': 'mp4', + 'title': 'A Day at the Farmers\' Protest', + 'description': 'md5:4b3df22747a03e8f14f746dd72190384', + 'thumbnail': 'https://s3.unscripted.news/anj2/60c0a55cd1e99b1079918a57/5f199a65-c803-4a5c-8fce-2077359c3b72.jpg', + 'duration': 2251.0, + 'series': 'Ground Reports', + } + }, { + 'url': 'https://www.unscripted.news/videos/you-get-the-politicians-you-deserve-ft-shashi-tharoor', + 'info_dict': { + 'id': '5fb3afbf18ac817d341a74d8', + 'display_id': 'you-get-the-politicians-you-deserve-ft-shashi-tharoor', + 'ext': 'mp4', + 'cast': ['Avalok Langer', 'Ashwin Mehta'], + 'thumbnail': 'https://s3.unscripted.news/anj2/5fb3afbf18ac817d341a74d8/82bd7942-4f20-4cd8-98ae-83f9e814f998.jpg', + 'description': 'md5:1e91b069238a705ca3a40f87e6f1182c', + 'duration': 1046.0, + 'series': 'Dumb Questions Only', + 'title': 'You Get The Politicians You Deserve! ft. 
Shashi Tharoor', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['dataLocal'] + + # TODO: get subtitle from srt key + formats, subtitles = self._extract_m3u8_formats_and_subtitles(nextjs_data['alt_content'], display_id) + + return { + 'id': nextjs_data['_id'], + 'display_id': display_id, + 'title': nextjs_data.get('title') or self._og_search_title(webpage), + 'description': nextjs_data.get('sh_heading') or self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': parse_duration(nextjs_data.get('duration')), + 'series': traverse_obj(nextjs_data, ('show', 'topic')), + 'cast': traverse_obj(nextjs_data, ('cast_crew', ..., 'displayname')), + } From acf306d1f97486c8c88455cfa294d11c818d41fe Mon Sep 17 00:00:00 2001 From: tobi1805 <66414944+tobi1805@users.noreply.github.com> Date: Fri, 30 Sep 2022 18:57:15 +0200 Subject: [PATCH 204/284] [extractor/tv2] Support new url format (#5063) Closes #4973 Authored by: tobi1805 --- yt_dlp/extractor/tv2.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/tv2.py b/yt_dlp/extractor/tv2.py index 391baa6c5e..0024f72414 100644 --- a/yt_dlp/extractor/tv2.py +++ b/yt_dlp/extractor/tv2.py @@ -16,23 +16,27 @@ class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v(?:ideo)?\d*/(?:[^?#]+/)*(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.tv2.no/v/916509/', + 'url': 'http://www.tv2.no/v/1791207/', 'info_dict': { - 'id': '916509', + 'id': '1791207', 'ext': 'mp4', - 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', - 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', - 'timestamp': 1431715610, - 'upload_date': '20150515', - 'duration': 157, + 'title': 'Her kolliderer romsonden med asteroiden ', + 'description': 'En romsonde har krasjet inn i en asteroide i verdensrommet. Kollisjonen skjedde klokken 01:14 natt til tirsdag 27. september norsk tid. 
\n\nNasa kaller det sitt første forsøk på planetforsvar.', + 'timestamp': 1664238190, + 'upload_date': '20220927', + 'duration': 146, + 'thumbnail': r're:^https://.*$', 'view_count': int, 'categories': list, }, }, { 'url': 'http://www.tv2.no/v2/916509', 'only_matching': True, + }, { + 'url': 'https://www.tv2.no/video/nyhetene/her-kolliderer-romsonden-med-asteroiden/1791207/', + 'only_matching': True, }] _PROTOCOLS = ('HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -114,13 +118,13 @@ def _real_extract(self, url): class TV2ArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?!v(?:ideo)?\d*/)[^?#]+/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'url': 'https://www.tv2.no/underholdning/forraeder/katarina-flatland-angrer-etter-forraeder-exit/15095188/', 'info_dict': { - 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', + 'id': '15095188', + 'title': 'Katarina Flatland angrer etter Forræder-exit', + 'description': 'SANDEFJORD (TV 2): Katarina Flatland (33) måtte følge i sine fars fotspor, da hun ble forvist fra Forræder.', }, 'playlist_count': 2, }, { @@ -138,7 +142,7 @@ def _real_extract(self, url): if not assets: # New embed pattern - for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage): + for v in re.findall(r'(?s)(?:TV2ContentboxVideo|TV2\.TV2Video)\(({.+?})\)', webpage): video = self._parse_json( v, playlist_id, transform_source=js_to_json, fatal=False) if not video: From 81b6102d2099eec78a2db9ae3d101a8503dd4f25 Mon Sep 17 00:00:00 2001 From: nixxo <nixxo@protonmail.com> Date: Fri, 30 Sep 2022 19:33:29 +0200 Subject: [PATCH 205/284] [downloader/ism] Support ec-3 codec (#5004) Closes #296 Authored by: nixxo --- test/test_InfoExtractor.py | 286 +++++++++++++++++++++++++++ test/testdata/ism/ec-3_test.Manifest | 1 + yt_dlp/downloader/ism.py | 2 + yt_dlp/extractor/common.py | 5 +- yt_dlp/utils.py | 2 +- 5 files changed, 293 insertions(+), 3 deletions(-) create mode 100644 test/testdata/ism/ec-3_test.Manifest diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f57a29ffc7..016a2ac7f8 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1567,6 +1567,292 @@ def test_parse_ism_formats(self): ] }, ), + ( + 'ec-3_test', + 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + [{ + 'format_id': 'audio_deu_1-224', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'isma', + 'tbr': 224, + 'asr': 48000, + 'vcodec': 'none', + 'acodec': 'EC-3', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'audio', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 0, + 'height': 0, + 'fourcc': 'EC-3', + 'language': 'deu', + 'codec_private_data': 
'00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00', + 'sampling_rate': 48000, + 'channels': 6, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'audio_ext': 'isma', + 'video_ext': 'none', + 'abr': 224, + }, { + 'format_id': 'audio_deu-127', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'isma', + 'tbr': 127, + 'asr': 48000, + 'vcodec': 'none', + 'acodec': 'AACL', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'audio', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 0, + 'height': 0, + 'fourcc': 'AACL', + 'language': 'deu', + 'codec_private_data': '1190', + 'sampling_rate': 48000, + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'audio_ext': 'isma', + 'video_ext': 'none', + 'abr': 127, + }, { + 'format_id': 'video_deu-23', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 384, + 'height': 216, + 'tbr': 23, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 384, + 'height': 216, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '000000016742C00CDB06077E5C05A808080A00000300020000030009C0C02EE0177CC6300F142AE00000000168CA8DC8', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 23, + }, { + 'format_id': 'video_deu-403', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 400, + 'height': 224, + 'tbr': 403, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 400, + 'height': 224, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D4014E98323B602D4040405000003000100000300320F1429380000000168EAECF2', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 403, + }, { + 'format_id': 'video_deu-680', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 
'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 640, + 'height': 360, + 'tbr': 680, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 640, + 'height': 360, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D401EE981405FF2E02D4040405000000300100000030320F162D3800000000168EAECF2', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 680, + }, { + 'format_id': 'video_deu-1253', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 640, + 'height': 360, + 'tbr': 1253, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 640, + 'height': 360, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D401EE981405FF2E02D4040405000000300100000030320F162D3800000000168EAECF2', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 1253, + }, { + 'format_id': 'video_deu-2121', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 768, + 'height': 432, + 'tbr': 2121, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 768, + 'height': 432, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D401EECA0601BD80B50101014000003000400000300C83C58B6580000000168E93B3C80', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 2121, + }, { + 'format_id': 'video_deu-3275', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 1280, + 'height': 720, + 'tbr': 3275, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 1280, + 'height': 720, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': 
'00000001674D4020ECA02802DD80B501010140000003004000000C83C60C65800000000168E93B3C80', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 3275, + }, { + 'format_id': 'video_deu-5300', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 1920, + 'height': 1080, + 'tbr': 5300, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 1920, + 'height': 1080, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D4028ECA03C0113F2E02D4040405000000300100000030320F18319600000000168E93B3C80', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 5300, + }, { + 'format_id': 'video_deu-8079', + 'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest', + 'ext': 'ismv', + 'width': 1920, + 'height': 1080, + 'tbr': 8079, + 'vcodec': 'AVC1', + 'acodec': 'none', + 'protocol': 'ism', + '_download_params': + { + 'stream_type': 'video', + 'duration': 370000000, + 'timescale': 10000000, + 'width': 1920, + 'height': 1080, + 'fourcc': 'AVC1', + 'language': 'deu', + 'codec_private_data': '00000001674D4028ECA03C0113F2E02D4040405000000300100000030320F18319600000000168E93B3C80', + 'channels': 2, + 'bits_per_sample': 16, + 'nal_unit_length_field': 4 + }, + 'video_ext': 'ismv', + 'audio_ext': 'none', + 'vbr': 8079, + }], + {}, + ), ] for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES: diff --git a/test/testdata/ism/ec-3_test.Manifest b/test/testdata/ism/ec-3_test.Manifest new file mode 100644 index 0000000000..45f95de73f --- /dev/null +++ b/test/testdata/ism/ec-3_test.Manifest @@ -0,0 +1 @@ +<?xml version="1.0" encoding="utf-8"?><!--Transformed by VSMT using XSL stylesheet for rule Identity--><!-- Created with Unified Streaming Platform (version=1.10.12-18737) --><SmoothStreamingMedia MajorVersion="2" MinorVersion="0" TimeScale="10000000" Duration="370000000"><StreamIndex Type="audio" QualityLevels="1" TimeScale="10000000" Language="deu" Name="audio_deu" Chunks="19" Url="QualityLevels({bitrate})/Fragments(audio_deu={start time})?noStreamProfile=1"><QualityLevel Index="0" Bitrate="127802" CodecPrivateData="1190" SamplingRate="48000" Channels="2" BitsPerSample="16" PacketSize="4" AudioTag="255" FourCC="AACL" /><c t="0" d="20053333" /><c d="20053334" /><c d="20053333" /><c d="19840000" /><c d="20053333" /><c d="20053334" /><c d="20053333" /><c d="19840000" /><c d="20053333" /><c d="20053334" /><c d="20053333" /><c d="19840000" /><c d="20053333" /><c d="20053334" /><c d="20053333" /><c d="19840000" /><c d="20053333" /><c d="20053334" /><c d="7253333" /></StreamIndex><StreamIndex Type="audio" 
QualityLevels="1" TimeScale="10000000" Language="deu" Name="audio_deu_1" Chunks="19" Url="QualityLevels({bitrate})/Fragments(audio_deu_1={start time})?noStreamProfile=1"><QualityLevel Index="0" Bitrate="224000" CodecPrivateData="00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00" FourCCData="0700200F00" SamplingRate="48000" Channels="6" BitsPerSample="16" PacketSize="896" AudioTag="65534" FourCC="EC-3" /><c t="0" d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="20160000" /><c d="19840000" /><c d="8320000" /></StreamIndex><StreamIndex Type="video" QualityLevels="8" TimeScale="10000000" Language="deu" Name="video_deu" Chunks="19" Url="QualityLevels({bitrate})/Fragments(video_deu={start time})?noStreamProfile=1" MaxWidth="1920" MaxHeight="1080" DisplayWidth="1920" DisplayHeight="1080"><QualityLevel Index="0" Bitrate="23909" CodecPrivateData="000000016742C00CDB06077E5C05A808080A00000300020000030009C0C02EE0177CC6300F142AE00000000168CA8DC8" MaxWidth="384" MaxHeight="216" FourCC="AVC1" /><QualityLevel Index="1" Bitrate="403188" CodecPrivateData="00000001674D4014E98323B602D4040405000003000100000300320F1429380000000168EAECF2" MaxWidth="400" MaxHeight="224" FourCC="AVC1" /><QualityLevel Index="2" Bitrate="680365" CodecPrivateData="00000001674D401EE981405FF2E02D4040405000000300100000030320F162D3800000000168EAECF2" MaxWidth="640" MaxHeight="360" FourCC="AVC1" /><QualityLevel Index="3" Bitrate="1253465" CodecPrivateData="00000001674D401EE981405FF2E02D4040405000000300100000030320F162D3800000000168EAECF2" MaxWidth="640" MaxHeight="360" FourCC="AVC1" /><QualityLevel Index="4" Bitrate="2121558" CodecPrivateData="00000001674D401EECA0601BD80B50101014000003000400000300C83C58B6580000000168E93B3C80" MaxWidth="768" MaxHeight="432" FourCC="AVC1" /><QualityLevel Index="5" Bitrate="3275545" CodecPrivateData="00000001674D4020ECA02802DD80B501010140000003004000000C83C60C65800000000168E93B3C80" MaxWidth="1280" MaxHeight="720" FourCC="AVC1" /><QualityLevel Index="6" Bitrate="5300196" CodecPrivateData="00000001674D4028ECA03C0113F2E02D4040405000000300100000030320F18319600000000168E93B3C80" MaxWidth="1920" MaxHeight="1080" FourCC="AVC1" /><QualityLevel Index="7" Bitrate="8079312" CodecPrivateData="00000001674D4028ECA03C0113F2E02D4040405000000300100000030320F18319600000000168E93B3C80" MaxWidth="1920" MaxHeight="1080" FourCC="AVC1" /><c t="0" d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="20000000" /><c d="10000000" /></StreamIndex></SmoothStreamingMedia> \ No newline at end of file diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index 801b5af813..c961dc62e9 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -138,6 +138,8 @@ def write_piff_header(stream, params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) + if fourcc == 'EC-3': + sample_entry_box = box(b'ec-3', sample_entry_payload) elif stream_type == 'video': sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved diff --git a/yt_dlp/extractor/common.py 
b/yt_dlp/extractor/common.py
index d36f025ab8..11e7158714 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3124,9 +3124,10 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
             stream_name = stream.get('Name')
             stream_language = stream.get('Language', 'und')
             for track in stream.findall('QualityLevel'):
-                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
+                KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
+                fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
                 # TODO: add support for WVC1 and WMAP
-                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
+                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                     self.report_warning('%s is not a supported codec' % fourcc)
                     continue
                 tbr = int(track.attrib['Bitrate']) // 1000
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 3e2ce84345..6cba9299a5 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3546,7 +3546,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
     COMPATIBLE_CODECS = {
         'mp4': {
             'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
-            'h264', 'aacl',  # Set in ISM
+            'h264', 'aacl', 'ec-3',  # Set in ISM
         },
         'webm': {
             'av1', 'vp9', 'vp8', 'opus', 'vrbs',

From 576faf00b24963d4ab9a1a23c1ab243c13d9ce16 Mon Sep 17 00:00:00 2001
From: Itachi <sulabh.biswas.0157@gmail.com>
Date: Fri, 30 Sep 2022 23:33:30 +0530
Subject: [PATCH 206/284] [extractor/Mxplayer] Fix extractor (#4966)

Closes #4946
Authored by: itachi-19
---
 yt_dlp/extractor/mxplayer.py | 135 ++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 64 deletions(-)

diff --git a/yt_dlp/extractor/mxplayer.py b/yt_dlp/extractor/mxplayer.py
index cdc340a80f..affdba10c8 100644
--- a/yt_dlp/extractor/mxplayer.py
+++ b/yt_dlp/extractor/mxplayer.py
@@ -1,6 +1,10 @@
 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import try_get
+from ..utils import (
+    int_or_none,
+    traverse_obj,
+    try_get,
+)


 class MxplayerIE(InfoExtractor):
@@ -9,6 +13,7 @@ class MxplayerIE(InfoExtractor):
         'url': 'https://www.mxplayer.in/show/watch-my-girlfriend-is-an-alien-hindi-dubbed/season-1/episode-1-online-9d2013d31d5835bb8400e3b3c5e7bb72',
         'info_dict': {
             'id': '9d2013d31d5835bb8400e3b3c5e7bb72',
+            'display_id': 'episode-1-online',
             'ext': 'mp4',
             'title': 'Episode 1',
             'description': 'md5:62ed43eb9fec5efde5cf3bd1040b7670',
@@ -17,7 +22,6 @@ class MxplayerIE(InfoExtractor):
             'duration': 2451,
             'season': 'Season 1',
             'series': 'My Girlfriend Is An Alien (Hindi Dubbed)',
-            'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/9d2013d31d5835bb8400e3b3c5e7bb72/en/16x9/320x180/9562f5f8df42cad09c9a9c4e69eb1567_1920x1080.webp',
             'episode': 'Episode 1'
         },
         'params': {
@@ -28,21 +32,17 @@ class MxplayerIE(InfoExtractor):
         'url': 'https://www.mxplayer.in/movie/watch-knock-knock-hindi-dubbed-movie-online-b9fa28df3bfb8758874735bbd7d2655a?watch=true',
         'info_dict': {
             'id': 'b9fa28df3bfb8758874735bbd7d2655a',
+            'display_id': 'watch-knock-knock-hindi-dubbed-movie-online',
             'ext': 'mp4',
             'title': 'Knock Knock (Hindi Dubbed)',
-            'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b',
-            'season_number': 0,
-            'episode_number': 0,
+            'description': 'md5:4160f2dfc3b87c524261366f6b736329',
             'duration': 5970,
-            'season': 'Season 0',
-            'series': None,
-            'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/b9fa28df3bfb8758874735bbd7d2655a/en/16x9/320x180/test_pic1588676032011.webp',
-            'episode': 'Episode 0'
         },
         'params': {
             'format': 'bv',
             'skip_download': True,
         },
+        'skip': 'No longer available',
     }, {
         'url': 
'https://www.mxplayer.in/show/watch-shaitaan/season-1/the-infamous-taxi-gang-of-meerut-online-45055d5bcff169ad48f2ad7552a83d6c', 'info_dict': { @@ -55,26 +55,26 @@ class MxplayerIE(InfoExtractor): 'duration': 2332, 'season': 'Season 1', 'series': 'Shaitaan', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/45055d5bcff169ad48f2ad7552a83d6c/en/16x9/320x180/voot_8e7d5f8d8183340869279c732c1e3a43.webp', 'episode': 'Episode 1' }, 'params': { 'format': 'best', 'skip_download': True, }, + 'skip': 'No longer available.' }, { 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb', 'info_dict': { 'id': 'd445579792b0135598ba1bc9088a84cb', + 'display_id': 'duh-swapna-online', 'ext': 'mp4', 'title': 'Duh Swapna', 'description': 'md5:35ff39c4bdac403c53be1e16a04192d8', 'season_number': 1, 'episode_number': 3, 'duration': 2568, - 'season': 'Chapter 1', + 'season': 'Season 1', 'series': 'Aashram', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/d445579792b0135598ba1bc9088a84cb/en/4x3/1600x1200/test_pic1624819307993.webp', 'episode': 'Episode 3' }, 'params': { @@ -85,6 +85,7 @@ class MxplayerIE(InfoExtractor): 'url': 'https://www.mxplayer.in/show/watch-dangerous/season-1/chapter-1-online-5a351b4f9fb69436f6bd6ae3a1a75292', 'info_dict': { 'id': '5a351b4f9fb69436f6bd6ae3a1a75292', + 'display_id': 'chapter-1-online', 'ext': 'mp4', 'title': 'Chapter 1', 'description': 'md5:233886b8598bc91648ac098abe1d288f', @@ -93,7 +94,6 @@ class MxplayerIE(InfoExtractor): 'duration': 1305, 'season': 'Season 1', 'series': 'Dangerous', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/5a351b4f9fb69436f6bd6ae3a1a75292/en/4x3/1600x1200/test_pic1624706302350.webp', 'episode': 'Episode 1' }, 'params': { @@ -107,72 +107,79 @@ class MxplayerIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Attacks of 26/11', 'description': 'md5:689bacd29e97b3f31eaf519eb14127e5', - 'season_number': 0, - 'episode_number': 0, 'duration': 6085, - 'season': 'Season 0', - 'series': None, - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/0452f0d80226c398d63ce7e3ea40fa2d/en/16x9/320x180/00c8955dab5e5d340dbde643f9b1f6fd_1920x1080.webp', - 'episode': 'Episode 0' }, 'params': { 'format': 'best', 'skip_download': True, }, + 'skip': 'No longer available. 
Cannot be played on browser' + }, { + 'url': 'https://www.mxplayer.in/movie/watch-kitne-door-kitne-paas-movie-online-a9e9c76c566205955f70d8b2cb88a6a2', + 'info_dict': { + 'id': 'a9e9c76c566205955f70d8b2cb88a6a2', + 'display_id': 'watch-kitne-door-kitne-paas-movie-online', + 'title': 'Kitne Door Kitne Paas', + 'duration': 8458, + 'ext': 'mp4', + 'description': 'md5:fb825f3c542513088024dcafef0921b4', + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-ek-thi-begum-hindi/season-2/game-of-power-online-5e5305c28f1409847cdc4520b6ad77cf', + 'info_dict': { + 'id': '5e5305c28f1409847cdc4520b6ad77cf', + 'display_id': 'game-of-power-online', + 'title': 'Game Of Power', + 'duration': 1845, + 'ext': 'mp4', + 'description': 'md5:1d0948d2a5312d7013792d53542407f9', + 'series': 'Ek Thi Begum (Hindi)', + 'season': 'Season 2', + 'season_number': 2, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, }] def _real_extract(self, url): - type, display_id, video_id = self._match_valid_url(url).groups() - type = 'movie_film' if type == 'movie' else 'tvshow_episode' - API_URL = 'https://androidapi.mxplay.com/v1/detail/' - headers = { - 'X-Av-Code': '23', - 'X-Country': 'IN', - 'X-Platform': 'android', - 'X-App-Version': '1370001318', - 'X-Resolution': '3840x2160', - } - data_json = self._download_json(f'{API_URL}{type}/{video_id}', display_id, headers=headers)['profile'] + video_type, display_id, video_id = self._match_valid_url(url).group('type', 'display_id', 'id') + if 'show' in video_type: + video_type = 'episode' - season, series = None, None - for dct in data_json.get('levelInfos', []): - if dct.get('type') == 'tvshow_season': - season = dct.get('name') - elif dct.get('type') == 'tvshow_show': - series = dct.get('name') - thumbnails = [] - for thumb in data_json.get('poster', []): - thumbnails.append({ - 'url': thumb.get('url'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - }) + data_json = self._download_json( + f'https://api.mxplay.com/v1/web/detail/video?type={video_type}&id={video_id}', display_id) - formats = [] - subtitles = {} - for dct in data_json.get('playInfo', []): - if dct.get('extension') == 'mpd': - frmt, subs = self._extract_mpd_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) - formats.extend(frmt) - subtitles = self._merge_subtitles(subtitles, subs) - elif dct.get('extension') == 'm3u8': - frmt, subs = self._extract_m3u8_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) - formats.extend(frmt) - subtitles = self._merge_subtitles(subtitles, subs) + streams = traverse_obj(data_json, ('stream', {'m3u8': ('hls', 'high'), 'mpd': ('dash', 'high')})) + formats, dash_subs = self._extract_mpd_formats_and_subtitles( + f'https://llvod.mxplay.com/{streams["mpd"]}', display_id, fatal=False) + hls_frmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + f'https://llvod.mxplay.com/{streams["m3u8"]}', display_id, fatal=False) + + formats.extend(hls_frmts) self._sort_formats(formats) + + season = traverse_obj(data_json, ('container', 'title')) return { 'id': video_id, - 'display_id': display_id, - 'title': data_json.get('name') or display_id, - 'description': data_json.get('description'), - 'season_number': data_json.get('seasonNum'), - 'episode_number': data_json.get('episodeNum'), - 'duration': data_json.get('duration'), - 'season': season, - 'series': series, - 'thumbnails': thumbnails, + 'title': 
data_json.get('title'),
             'formats': formats,
-            'subtitles': subtitles,
+            'subtitles': self._merge_subtitles(dash_subs, hls_subs),
+            'display_id': display_id,
+            'duration': data_json.get('duration'),
+            'series': traverse_obj(data_json, ('container', 'container', 'title')),
+            'description': data_json.get('description'),
+            'season': season,
+            'season_number': int_or_none(
+                self._search_regex(r'Season (\d+)', season, 'Season Number', default=None)),
+            'episode_number': data_json.get('sequence') or None,
         }

From af7a5eef2f0fce13dbeb375cb97f316292a694c7 Mon Sep 17 00:00:00 2001
From: std-move <26625259+std-move@users.noreply.github.com>
Date: Sat, 1 Oct 2022 17:30:14 +0200
Subject: [PATCH 207/284] [downloader/aria2c] Fix filename containing leading whitespace (#5099)

Similar to eb55bad5a0c1af9388301ffbf17845ee53a41635, but for fragmented downloads

Authored by: std-move
---
 yt_dlp/downloader/external.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index d117c06e0a..895390d6cf 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -252,6 +252,10 @@ def supports_manifest(manifest):
         check_results = (not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
         return all(check_results)

+    @staticmethod
+    def _aria2c_filename(fn):
+        return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}'
+
     def _make_cmd(self, tmpfilename, info_dict):
         cmd = [self.exe, '-c',
                '--console-log-level=warn', '--summary-interval=0', '--download-result=hide',
@@ -280,11 +284,9 @@ def _make_cmd(self, tmpfilename, info_dict):
         # https://github.com/aria2/aria2/issues/1373
         dn = os.path.dirname(tmpfilename)
         if dn:
-            if not os.path.isabs(dn):
-                dn = f'.{os.path.sep}{dn}'
-            cmd += ['--dir', dn + os.path.sep]
+            cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep]
         if 'fragments' not in info_dict:
-            cmd += ['--out', f'.{os.path.sep}{os.path.basename(tmpfilename)}']
+            cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))]
         cmd += ['--auto-file-renaming=false']

         if 'fragments' in info_dict:
@@ -293,11 +295,11 @@ def _make_cmd(self, tmpfilename, info_dict):
             url_list = []
             for frag_index, fragment in enumerate(info_dict['fragments']):
                 fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index)
-                url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename))
+                url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename)))
             stream, _ = self.sanitize_open(url_list_file, 'wb')
             stream.write('\n'.join(url_list).encode())
             stream.close()
-            cmd += ['-i', url_list_file]
+            cmd += ['-i', self._aria2c_filename(url_list_file)]
         else:
             cmd += ['--', info_dict['url']]
         return cmd
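
A rough sketch of what the new `_aria2c_filename` helper does (the filenames
below are invented for illustration, and the output assumes POSIX, where
`os.path.sep` is '/'):

    _aria2c_filename(' frag 0.part')    # -> './ frag 0.part'; the ./ prefix keeps
                                        #    aria2c from trimming the leading space
    _aria2c_filename('/tmp/frag.part')  # -> '/tmp/frag.part'; absolute paths pass through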

From 573a98d6f0867f9acb909cb3ff3dc9c10f9b2e8b Mon Sep 17 00:00:00 2001
From: Dhruv <74945202+0xGodspeed@users.noreply.github.com>
Date: Sun, 2 Oct 2022 03:37:09 +0530
Subject: [PATCH 208/284] [extractor/bongacams] Update `_VALID_URL` (#5104)

Closes #5075
Authored by: 0xGodspeed
---
 yt_dlp/extractor/bongacams.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/bongacams.py b/yt_dlp/extractor/bongacams.py
index cbef0fc53a..9ba166b043 100644
--- a/yt_dlp/extractor/bongacams.py
+++ b/yt_dlp/extractor/bongacams.py
@@ -8,13 +8,28 @@


 class BongaCamsIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)'
     _TESTS = [{
         'url': 'https://de.bongacams.com/azumi-8',
         'only_matching': True,
     }, {
         'url': 'https://cn.bongacams.com/azumi-8',
         'only_matching': True,
+    }, {
+        'url': 'https://de.bongacams.net/claireashton',
+        'info_dict': {
+            'id': 'claireashton',
+            'ext': 'mp4',
+            'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+            'age_limit': 18,
+            'uploader_id': 'ClaireAshton',
+            'uploader': 'ClaireAshton',
+            'like_count': int,
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]

     def _real_extract(self, url):

From a83333c4328591c279a27dd0ec4c7c5addcc411f Mon Sep 17 00:00:00 2001
From: Teemu Ikonen <tpikonen@gmail.com>
Date: Mon, 3 Oct 2022 00:23:48 +0300
Subject: [PATCH 209/284] [extractor/iltalehti] Add extractor (#5117)

Authored by: tpikonen
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/iltalehti.py   | 51 +++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 yt_dlp/extractor/iltalehti.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 4d94d35633..f104b3e35e 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -718,6 +718,7 @@
     IHeartRadioIE,
     IHeartRadioPodcastIE,
 )
+from .iltalehti import IltalehtiIE
 from .imdb import (
     ImdbIE,
     ImdbListIE
diff --git a/yt_dlp/extractor/iltalehti.py b/yt_dlp/extractor/iltalehti.py
new file mode 100644
index 0000000000..a40307aed4
--- /dev/null
+++ b/yt_dlp/extractor/iltalehti.py
@@ -0,0 +1,51 @@
+from .common import InfoExtractor
+from ..utils import js_to_json, traverse_obj
+
+
+class IltalehtiIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?iltalehti\.fi/[^/?#]+/a/(?P<id>[^/?#]+)'
+    _TESTS = [
+        # jwplatform embed main_media
+        {
+            'url': 'https://www.iltalehti.fi/ulkomaat/a/9fbd067f-94e4-46cd-8748-9d958eb4dae2',
+            'md5': 'af12d42c539f1f49f0b62d231fe72dcd',
+            'info_dict': {
+                'id': 'gYjjaf1L',
+                'ext': 'mp4',
+                'title': 'Sensuroimaton Päivärinta, jakso 227: Vieraana Suomen Venäjän ex-suurlähettiläs René Nyberg ja Kenraalimajuri evp Pekka Toveri',
+                'description': '',
+                'upload_date': '20220928',
+                'timestamp': 1664360878,
+                'duration': 2089,
+                'thumbnail': r're:^https?://.*\.jpg',
+            },
+        },
+        # jwplatform embed body
+        {
+            'url': 'https://www.iltalehti.fi/politiikka/a/1ce49d85-1670-428b-8db8-d2479b9950a4',
+            'md5': '9e50334b8f8330ce8828b567a82a3c65',
+            'info_dict': {
+                'id': '18R6zkLi',
+                'ext': 'mp4',
+                'title': 'Pekka Toverin arvio: Näin Nord Stream -kaasuputken räjäyttäminen on saatettu toteuttaa',
+                'description': 'md5:3d1302c9e17e7ffd564143ff58f8de35',
+                'upload_date': '20220929',
+                'timestamp': 1664435867,
+                'duration': 165.0,
+                'thumbnail': r're:^https?://.*\.jpg',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        info = self._search_json(
+            r'<script>\s*window.App\s*=\s*', webpage, 'json', article_id,
+            transform_source=js_to_json)
+        props = traverse_obj(info, (
+            'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties'))))
+        video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id'))
+        return self.playlist_from_matches(
+            video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}',
+            title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False))
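
The extractor above relies on `_search_json` with `transform_source=js_to_json`
to pull the page state out of a script tag. A minimal sketch of that pattern
(the page snippet and values are invented for illustration):

    webpage = '<script>window.App = {state: {articles: []}};</script>'
    info = self._search_json(
        r'<script>\s*window\.App\s*=\s*', webpage, 'json', article_id,
        transform_source=js_to_json)  # js_to_json quotes the bare JS keys first
    # info == {'state': {'articles': []}}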

From 8b7fb8b60da78b54a518246b251be3d1829fef38 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Mon, 3 Oct 2022 16:50:27 +0530
Subject: [PATCH 210/284] [extractor] Make search_json able to parse lists

Now `contains_pattern` can be set to `\[.+\]`
---
 yt_dlp/extractor/common.py      | 4 ++--
 yt_dlp/extractor/dropbox.py     | 2 +-
 yt_dlp/extractor/radiofrance.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 11e7158714..caec0ccf62 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1227,7 +1227,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
         return None

     def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
-                     contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
+                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
         """Searches string for the JSON object specified by start_pattern"""
         # NB: end_pattern is only used to reduce the size of the initial match
         if default is NO_DEFAULT:
@@ -1236,7 +1236,7 @@ def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
             fatal, has_default = False, True

         json_string = self._search_regex(
-            rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})',
+            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
             string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
         if not json_string:
             return default
diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py
index 0d12513b29..54d97a25dc 100644
--- a/yt_dlp/extractor/dropbox.py
+++ b/yt_dlp/extractor/dropbox.py
@@ -54,7 +54,7 @@ def _real_extract(self, url):
                 raise ExtractorError('Password protected video, use --video-password <password>', expected=True)

         info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id,
-                                      contains_pattern=r'.+?"preview".+?', end_pattern=r'\)')['props']
+                                      contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props']
         transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False)
         formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id)
diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 7b60b2617b..38420a15d6 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -84,7 +84,7 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, display_id)

         # _search_json_ld doesn't correctly handle this. 
See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846 - video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+') + video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}') return { 'id': video_id, From 8a04054647d40037499e446cd6c1099cdd46f4c8 Mon Sep 17 00:00:00 2001 From: Nitish Kumar <snapdgnn@proton.me> Date: Mon, 3 Oct 2022 18:17:52 +0530 Subject: [PATCH 211/284] [extractor/hrfensehen] Fix extractor (#5096) Authored by: snapdgn --- yt_dlp/extractor/hrfensehen.py | 53 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py index 6f7ed9b4bc..dd72d86d77 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -1,14 +1,19 @@ import json import re -from ..utils import int_or_none, unified_timestamp, unescapeHTML +from ..utils import ( + int_or_none, + traverse_obj, + try_call, + unescapeHTML, + unified_timestamp, +) from .common import InfoExtractor class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' - _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', @@ -21,10 +26,11 @@ class HRFernsehenIE(InfoExtractor): 'subtitles': {'de': [{ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' }]}, - 'timestamp': 1598470200, + 'timestamp': 1598400000, 'upload_date': '20200826', - 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', - 'title': 'hessenschau vom 26.08.2020' + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'title': 'hessenschau vom 26.08.2020', + 'duration': 1654 } }, { 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', @@ -33,25 +39,18 @@ class HRFernsehenIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - def extract_airdate(self, loader_data): - airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') - - if airdate_str is None: - return None - - return unified_timestamp(airdate_str) - def extract_formats(self, loader_data): stream_formats = [] - for stream_obj in loader_data["videoResolutionLevels"]: + data = loader_data['mediaCollection']['streams'][0]['media'] + for inner in data[1:]: stream_format = { - 'format_id': str(stream_obj['verticalResolution']) + "p", - 'height': stream_obj['verticalResolution'], - 'url': stream_obj['url'], + 'format_id': try_call(lambda: f'{inner["maxHResolutionPx"]}p'), + 'height': inner.get('maxHResolutionPx'), + 'url': inner['url'], } quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', - stream_obj['url']) + inner['url']) if quality_information: stream_format['width'] = int_or_none(quality_information.group(1)) stream_format['height'] = int_or_none(quality_information.group(2)) @@ -72,22 +71,22 @@ def _real_extract(self, url): description = self._html_search_meta( ['description'], webpage) - loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_str = 
unescapeHTML(self._search_regex(r"data-(?:new-)?hr-mediaplayer-loader='([^']*)'", webpage, 'ardloader')) loader_data = json.loads(loader_str) + subtitle = traverse_obj(loader_data, ('mediaCollection', 'subTitles', 0, 'sources', 0, 'url')) + info = { 'id': video_id, 'title': title, 'description': description, 'formats': self.extract_formats(loader_data), - 'timestamp': self.extract_airdate(loader_data) + 'subtitles': {'de': [{'url': subtitle}]}, + 'timestamp': unified_timestamp(self._search_regex( + r'<time\sdatetime="(\d{4}\W\d{1,2}\W\d{1,2})', webpage, 'datetime', fatal=False)), + 'duration': int_or_none(traverse_obj( + loader_data, ('playerConfig', 'pluginData', 'trackingAti@all', 'richMedia', 'duration'))), + 'thumbnail': self._search_regex(r'thumbnailUrl\W*([^"]+)', webpage, 'thumbnail', default=None), } - if "subtitle" in loader_data: - info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} - - thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) - if len(thumbnails) > 0: - info["thumbnails"] = [{"url": t} for t in thumbnails] - return info From eb2d9504b91c4ca3b10a90302df53b867924e86b Mon Sep 17 00:00:00 2001 From: zenerdi0de <83358565+zenerdi0de@users.noreply.github.com> Date: Mon, 3 Oct 2022 18:37:09 +0530 Subject: [PATCH 212/284] [extractor/tennistv] Fix timestamp (#5085) Authored by: zenerdi0de --- yt_dlp/extractor/tennistv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py index 3bd7ce3c43..5baa21d52a 100644 --- a/yt_dlp/extractor/tennistv.py +++ b/yt_dlp/extractor/tennistv.py @@ -148,7 +148,7 @@ def _real_extract(self, url): webpage, 'description', fatal=False), 'thumbnail': f'https://open.http.mp.streamamg.com/p/{self._PARTNER_ID}/sp/{self._PARTNER_ID}00/thumbnail/entry_id/{entryid}/version/100001/height/1920', 'timestamp': unified_timestamp(self._html_search_regex( - r'<span itemprop="description" content=["\']([^"\']+)["\']>', webpage, 'upload time')), + r'<span itemprop="uploadDate" content=["\']([^"\']+)["\']>', webpage, 'upload time', fatal=False)), 'series': self._html_search_regex(r'data-series\s*?=\s*?"(.*?)"', webpage, 'series', fatal=False) or None, 'season': self._html_search_regex(r'data-tournament-city\s*?=\s*?"(.*?)"', webpage, 'season', fatal=False) or None, 'episode': self._html_search_regex(r'data-round\s*?=\s*?"(.*?)"', webpage, 'round', fatal=False) or None, From f48ab881f6a75fbc61f7d9c132180f7696db95f8 Mon Sep 17 00:00:00 2001 From: Fabi019 <fabi019@gmx.de> Date: Mon, 3 Oct 2022 15:40:09 +0200 Subject: [PATCH 213/284] [extractor/bundesliga] Add extractor (#5094) Closes #2339 Authored by: Fabi019 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bundesliga.py | 34 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 yt_dlp/extractor/bundesliga.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f104b3e35e..f4d7c3ab5e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -233,6 +233,7 @@ BrightcoveNewIE, ) from .businessinsider import BusinessInsiderIE +from .bundesliga import BundesligaIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/yt_dlp/extractor/bundesliga.py b/yt_dlp/extractor/bundesliga.py new file mode 100644 index 0000000000..e76dd58ddb --- /dev/null +++ b/yt_dlp/extractor/bundesliga.py @@ -0,0 +1,34 @@ +from .common import InfoExtractor +from .jwplatform import JWPlatformIE 
+ + +class BundesligaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bundesliga\.com/[a-z]{2}/bundesliga/videos(?:/[^?]+)?\?vid=(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [ + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos?vid=bhhHkKyN', + 'md5': '8fc3b25cd12440e3a8cdc51f1493849c', + 'info_dict': { + 'id': 'bhhHkKyN', + 'ext': 'mp4', + 'title': 'Watch: Alphonso Davies and Jeremie Frimpong head-to-head', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/bhhHkKyN/poster.jpg?width=720', + 'upload_date': '20220928', + 'duration': 146, + 'timestamp': 1664366511, + 'description': 'md5:803d4411bd134140c774021dd4b7598b' + } + }, + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos/latest-features/T8IKc8TX?vid=ROHjs06G', + 'only_matching': True + }, + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos/goals?vid=mOG56vWA', + 'only_matching': True + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE, video_id) From 177662e0f24bfd54e57b87698739d7a518321bac Mon Sep 17 00:00:00 2001 From: sam <mail@samueljenks.me> Date: Tue, 4 Oct 2022 02:52:30 +1300 Subject: [PATCH 214/284] [extractor/MicrosoftEmbed] Add extractor (#5082) Closes #2638 Authored by: DoubleCouponDay --- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/microsoftembed.py | 70 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/microsoftembed.py diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4fcf1f5cc7..bc6de49267 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3640,7 +3640,7 @@ def render_thumbnails_table(self, info_dict): return None return render_table( self._list_format_headers('ID', 'Width', 'Height', 'URL'), - [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]) + [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails]) def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f4d7c3ab5e..3a92c1d028 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -960,6 +960,7 @@ MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .microsoftembed import MicrosoftEmbedIE from .mildom import ( MildomIE, MildomVodIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py new file mode 100644 index 0000000000..8cdf66778b --- /dev/null +++ b/yt_dlp/extractor/microsoftembed.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + + +class MicrosoftEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?microsoft\.com/(?:[^/]+/)?videoplayer/embed/(?P<id>[a-z0-9A-Z]+)' + + _TESTS = [{ + 'url': 'https://www.microsoft.com/en-us/videoplayer/embed/RWL07e', + 'md5': 'eb0ae9007f9b305f9acd0a03e74cb1a9', + 'info_dict': { + 'id': 'RWL07e', + 'title': 'Microsoft for Public Health and Social Services', + 'ext': 'mp4', + 'thumbnail': 'http://img-prod-cms-rt-microsoft-com.akamaized.net/cms/api/am/imageFileData/RWL7Ju?ver=cae5', + 'age_limit': 0, + 'timestamp': 1631658316, + 'upload_date': '20210914' + } + }] + _API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/' + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = 
self._download_json(self._API_URL + video_id, video_id) + + formats = [] + for source_type, source in metadata['streams'].items(): + if source_type == 'smooth_Streaming': + formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss')) + elif source_type == 'apple_HTTP_Live_Streaming': + formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4')) + elif source_type == 'mPEG_DASH': + formats.extend(self._extract_mpd_formats(source['url'], video_id)) + else: + formats.append({ + 'format_id': source_type, + 'url': source['url'], + 'height': source.get('heightPixels'), + 'width': source.get('widthPixels'), + }) + self._sort_formats(formats) + + subtitles = { + lang: [{ + 'url': data.get('url'), + 'ext': 'vtt', + }] for lang, data in traverse_obj(metadata, 'captions', default={}).items() + } + + thumbnails = [{ + 'url': thumb.get('url'), + 'width': thumb.get('width') or None, + 'height': thumb.get('height') or None, + } for thumb in traverse_obj(metadata, ('snippet', 'thumbnails', ...))] + self._remove_duplicate_formats(thumbnails) + + return { + 'id': video_id, + 'title': traverse_obj(metadata, ('snippet', 'title')), + 'timestamp': unified_timestamp(traverse_obj(metadata, ('snippet', 'activeStartDate'))), + 'age_limit': int_or_none(traverse_obj(metadata, ('snippet', 'minimumAge'))) or 0, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } From 7244895bde622c6aa0f2d858af1989c4b4f7b4aa Mon Sep 17 00:00:00 2001 From: m4tu4g <71326926+m4tu4g@users.noreply.github.com> Date: Mon, 3 Oct 2022 19:42:56 +0530 Subject: [PATCH 215/284] [extractor/zee5] Fix `_VALID_URL` (#5124) Closes #4612 Authored by: m4tu4g --- yt_dlp/extractor/zee5.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index d0229e78b5..a030e6f219 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -23,7 +23,7 @@ class Zee5IE(InfoExtractor): https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? (?: (?:tv-shows|kids|web-series|zee5originals)(?:/[^#/?]+){3} - |movies/[^#/?]+ + |(?:movies|kids|videos)/(?!kids-shows)[^#/?]+ )/(?P<display_id>[^#/?]+)/ ) (?P<id>[^#/?]+)/?(?:$|[?#]) @@ -84,6 +84,9 @@ class Zee5IE(InfoExtractor): }, { 'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408/maine-dekhi-hai-uski-mrityu/0-1-6z587412', 'only_matching': True + }, { + 'url': 'https://www.zee5.com/kids/kids-movies/maya-bommalu/0-0-movie_1040370005', + 'only_matching': True }] _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') @@ -176,7 +179,7 @@ class Zee5SeriesIE(InfoExtractor): (?: zee5:series:| https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? 
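# --- Editorial sketch, not part of the patch: the Zee5 _VALID_URL fix relies on
# a negative lookahead, so /kids/... paths match only when the next segment is
# not 'kids-shows'. A standalone check with a pattern simplified from the one
# above; the first sample URL is from the patch's own tests, the second is
# made up to show the exclusion:
import re

pattern = re.compile(
    r'https?://(?:www\.)?zee5\.com/(?:movies|kids|videos)/(?!kids-shows)[^#/?]+/')
assert pattern.match('https://www.zee5.com/kids/kids-movies/maya-bommalu/0-0-movie_1040370005')
assert not pattern.match('https://www.zee5.com/kids/kids-shows/some-show/0-6-123')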
- (?:tv-shows|web-series|kids|zee5originals)(?:/[^#/?]+){2}/ + (?:tv-shows|web-series|kids|zee5originals)/(?!kids-movies)(?:[^#/?]+/){2} ) (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#]) ''' From 4a61501db9369c813f913dc491c36951f8b087ad Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:15:22 +0000 Subject: [PATCH 216/284] [extractor/anvato] Fix extractor and refactor (#5074) Authored by: bashonly --- Makefile | 3 +- setup.py | 1 - yt_dlp/extractor/anvato.py | 189 +++++++++++------- .../anvato_token_generator/__init__.py | 5 - .../anvato_token_generator/common.py | 3 - .../extractor/anvato_token_generator/nfl.py | 28 --- 6 files changed, 116 insertions(+), 113 deletions(-) delete mode 100644 yt_dlp/extractor/anvato_token_generator/__init__.py delete mode 100644 yt_dlp/extractor/anvato_token_generator/common.py delete mode 100644 yt_dlp/extractor/anvato_token_generator/nfl.py diff --git a/Makefile b/Makefile index 6cb9e2f57e..19a377002b 100644 --- a/Makefile +++ b/Makefile @@ -74,8 +74,7 @@ offlinetest: codetest $(PYTHON) -m pytest -k "not download" # XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat \ - yt_dlp/extractor/anvato_token_generator +CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip for d in $(CODE_FOLDERS) ; do \ diff --git a/setup.py b/setup.py index e376a694a3..3641dfae95 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,6 @@ def packages(): return [ 'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat', - 'yt_dlp.extractor.anvato_token_generator', ] diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index cb94835693..5d03070852 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -5,10 +5,8 @@ import re import time -from .anvato_token_generator import NFLTokenGenerator from .common import InfoExtractor from ..aes import aes_encrypt -from ..compat import compat_str from ..utils import ( bytes_to_intlist, determine_ext, @@ -16,20 +14,61 @@ int_or_none, join_nonempty, strip_jsonp, + smuggle_url, + traverse_obj, unescapeHTML, unsmuggle_url, ) def md5_text(s): - if not isinstance(s, compat_str): - s = compat_str(s) - return hashlib.md5(s.encode('utf-8')).hexdigest() + return hashlib.md5(str(s).encode()).hexdigest() class AnvatoIE(InfoExtractor): _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js + + _TESTS = [{ + # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14 + 'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', + 'md5': '921919dab3cd0b849ff3d624831ae3e2', + 'info_dict': { + 'id': '899441', + 'ext': 'mp4', + 'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14', + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'NFL', + 'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights', + 'Player Highlights', 'Cleveland Browns', 'league'], + 'duration': 157, + 'categories': ['Entertainment', 'Game', 'Highlights'], + }, + }, { + # from 
https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/ + 'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455', + 'md5': '837718bcfb3a7778d022f857f7a9b19e', + 'info_dict': { + 'id': '8032455', + 'ext': 'mp4', + 'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream', + 'description': 'md5:0a12bab8159445e78f52a297a35c6609', + 'upload_date': '20220928', + 'timestamp': 1664408881, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'LIN', + 'tags': ['video', 'news', '5live'], + 'duration': 155, + 'categories': ['News'], + }, + }] + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -202,86 +241,74 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + def _generate_nfl_token(self, anvack, mcp_id): + reroute = self._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}, note='Fetching token info') + token_type = reroute.get('token_type') or 'Bearer' + auth_token = f'{token_type} {reroute["access_token"]}' + response = self._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token + } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': auth_token, + 'Content-Type': 'application/json', + }, note='Fetching NFL API token') + return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token')) + _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token, } - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - - _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' - _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' - - _TESTS = [{ - # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 - 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', - 'info_dict': { - 'id': '4465496', - 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. 
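# --- Editorial sketch, not part of the patch: _generate_nfl_token above is a
# two-step handshake - a client-credentials POST to /v1/reroute for a bearer
# token, then a GraphQL query against /v3/shield/ for the media token. A
# standalone approximation with urllib; the endpoints and payload shapes are
# taken from the patch, anvack/mcp_id are placeholders, and the live API may
# well reject callers outside the player context:
import json
import urllib.request

def fetch_nfl_media_token(anvack, mcp_id):
    req = urllib.request.Request(
        'https://api.nfl.com/v1/reroute',
        data=b'grant_type=client_credentials', headers={'X-Domain-Id': '100'})
    reroute = json.load(urllib.request.urlopen(req))
    auth = f"{reroute.get('token_type') or 'Bearer'} {reroute['access_token']}"
    query = '{ viewer { mediaToken(anvack: "%s", id: %s) { token } } }' % (anvack, mcp_id)
    req = urllib.request.Request(
        'https://api.nfl.com/v3/shield/', data=json.dumps({'query': query}).encode(),
        headers={'Authorization': auth, 'Content-Type': 'application/json'})
    shield = json.load(urllib.request.urlopen(req))
    return shield['data']['viewer']['mediaToken']['token']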
Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', - }, - 'params': { - 'skip_download': True, - }, - }, { - # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ - 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', - 'only_matching': True, - }] - - def __init__(self, *args, **kwargs): - super(AnvatoIE, self).__init__(*args, **kwargs) - self.__server_time = None - def _server_time(self, access_key, video_id): - if self.__server_time is not None: - return self.__server_time + return int_or_none(traverse_obj(self._download_json( + f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key}, + note='Fetching server time', fatal=False), 'server_time')) or int(time.time()) - self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, - note='Fetching server time')['server_time']) - - return self.__server_time - - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - - def _get_video_json(self, access_key, video_id): + def _get_video_json(self, access_key, video_id, extracted_token): # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}' server_time = self._server_time(access_key, video_id) - input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' auth_secret = intlist_to_bytes(aes_encrypt( bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + query = { + 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), + 'rtyp': 'fp', + } anvrid = md5_text(time.time() * 1000 * random.random())[:30] api = { 'anvrid': anvrid, 'anvts': server_time, } - if self._TOKEN_GENERATORS.get(access_key) is not None: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + if extracted_token is not None: + api['anvstk2'] = extracted_token + elif self._TOKEN_GENERATORS.get(access_key) is not None: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id) + elif self._ANVACK_TABLE.get(access_key) is not None: + api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}') else: - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) + api['anvstk2'] = 'default' return self._download_json( - video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps({'api': api}).encode('utf-8')) + video_data_url, video_id, transform_source=strip_jsonp, query=query, + data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8')) - def _get_anvato_videos(self, access_key, video_id): - video_data = self._get_video_json(access_key, video_id) + def _get_anvato_videos(self, access_key, video_id, token): + video_data = self._get_video_json(access_key, video_id, token) formats = [] for published_url in video_data['published_urls']: - video_url = published_url['embed_url'] + video_url = 
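# --- Editorial sketch, not part of the patch: the X-Anvato-Adst-Auth value
# built in _get_video_json above is just the server time plus two MD5 digests,
# run through the AES helper with the static key lifted from anvplayer.min.js
# and base64-encoded. The same computation standalone, reusing yt-dlp's own
# helpers exactly as the patch does (the server_time argument is illustrative):
import base64
import hashlib

from yt_dlp.aes import aes_encrypt
from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes

AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'  # _AUTH_KEY from the patch

def anvato_auth_header(video_data_url, server_time):
    md5 = lambda s: hashlib.md5(str(s).encode()).hexdigest()
    input_data = f'{server_time}~{md5(video_data_url)}~{md5(server_time)}'
    # Only the first 64 characters are fed to the cipher, as in the patch.
    secret = intlist_to_bytes(aes_encrypt(
        bytes_to_intlist(input_data[:64]), bytes_to_intlist(AUTH_KEY)))
    return base64.b64encode(secret).decode('ascii')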
published_url.get('embed_url') + if not video_url: + continue media_format = published_url.get('format') ext = determine_ext(video_url) @@ -296,15 +323,27 @@ def _get_anvato_videos(self, access_key, video_id): 'tbr': tbr or None, } - if media_format == 'm3u8' and tbr is not None: + vtt_subs, hls_subs = {}, {} + if media_format == 'vtt': + _, vtt_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, m3u8_id='vtt', fatal=False) + continue + elif media_format == 'm3u8' and tbr is not None: a_format.update({ 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + # For some videos the initial m3u8 URL returns JSON instead + manifest_json = self._download_json( + video_url, video_id, note='Downloading manifest JSON', errnote=False) + if manifest_json: + video_url = manifest_json.get('master_m3u8') + if not video_url: + continue + hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) continue elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' @@ -324,6 +363,7 @@ def _get_anvato_videos(self, access_key, video_id): 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None } subtitles.setdefault(caption['language'], []).append(a_caption) + subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs) return { 'id': video_id, @@ -349,7 +389,10 @@ def _extract_from_webpage(cls, url, webpage): access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) if not (video_id or '').isdigit() or not access_key: continue - yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id) + url = f'anvato:{access_key}:{video_id}' + if anvplayer_data.get('token'): + url = smuggle_url(url, {'token': anvplayer_data['token']}) + yield cls.url_result(url, AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): anvplayer_data = self._parse_json( @@ -357,7 +400,7 @@ def _extract_anvato_videos(self, webpage, video_id): self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), video_id) return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video']) + anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -365,9 +408,7 @@ def _real_extract(self, url): 'countries': smuggled_data.get('geo_countries'), }) - mobj = self._match_valid_url(url) - access_key, video_id = mobj.group('access_key_or_mcp', 'id') + access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( - access_key) or access_key - return self._get_anvato_videos(access_key, video_id) + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key + return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token')) diff --git a/yt_dlp/extractor/anvato_token_generator/__init__.py b/yt_dlp/extractor/anvato_token_generator/__init__.py deleted file mode 100644 index 6530caf530..0000000000 --- a/yt_dlp/extractor/anvato_token_generator/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .nfl import NFLTokenGenerator - -__all__ = [ - 'NFLTokenGenerator', -] diff --git 
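# --- Editorial sketch, not part of the patch: _extract_from_webpage above only
# has a URL string to pass along, so the page-provided player token is smuggled
# into the URL and recovered again in _real_extract via unsmuggle_url. Round
# trip with yt-dlp's real helpers (the token value is made up):
from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url('anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', {'token': 'abc123'})
plain, data = unsmuggle_url(url, {})
assert plain == 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441'
assert data == {'token': 'abc123'}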
a/yt_dlp/extractor/anvato_token_generator/common.py b/yt_dlp/extractor/anvato_token_generator/common.py deleted file mode 100644 index 3800b5808e..0000000000 --- a/yt_dlp/extractor/anvato_token_generator/common.py +++ /dev/null @@ -1,3 +0,0 @@ -class TokenGenerator: - def generate(self, anvack, mcp_id): - raise NotImplementedError('This method must be implemented by subclasses') diff --git a/yt_dlp/extractor/anvato_token_generator/nfl.py b/yt_dlp/extractor/anvato_token_generator/nfl.py deleted file mode 100644 index 9ee4aa002e..0000000000 --- a/yt_dlp/extractor/anvato_token_generator/nfl.py +++ /dev/null @@ -1,28 +0,0 @@ -import json - -from .common import TokenGenerator - - -class NFLTokenGenerator(TokenGenerator): - _AUTHORIZATION = None - - def generate(ie, anvack, mcp_id): - if not NFLTokenGenerator._AUTHORIZATION: - reroute = ie._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, - data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}) - NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) - return ie._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), - }).encode(), headers={ - 'Authorization': NFLTokenGenerator._AUTHORIZATION, - 'Content-Type': 'application/json', - })['data']['viewer']['mediaToken']['token'] From 8671f995cc5296f1bc9f68afc886353b5a9e40aa Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 3 Oct 2022 19:35:05 +0000 Subject: [PATCH 217/284] [extractor/paramountplus] Better DRM detection (#5126) Closes #5119 Authored by: bashonly --- yt_dlp/extractor/paramountplus.py | 63 +++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py index 7987d77c6e..fb6d07ac7c 100644 --- a/yt_dlp/extractor/paramountplus.py +++ b/yt_dlp/extractor/paramountplus.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from .cbs import CBSBaseIE from ..utils import ( + ExtractorError, int_or_none, url_or_none, ) @@ -24,14 +25,22 @@ class ParamountPlusIE(CBSBaseIE): 'ext': 'mp4', 'title': 'CatDog - Climb Every CatDog/The Canine Mutiny', 'description': 'md5:7ac835000645a69933df226940e3c859', - 'duration': 1418, + 'duration': 1426, 'timestamp': 920264400, 'upload_date': '19990301', 'uploader': 'CBSI-NEW', + 'episode_number': 5, + 'thumbnail': r're:https?://.+\.jpg$', + 'season': 'Season 2', + 'chapters': 'count:3', + 'episode': 'Episode 5', + 'season_number': 2, + 'series': 'CatDog', }, 'params': { 'skip_download': 'm3u8', }, + 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this }, { 'url': 'https://www.paramountplus.com/shows/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/', 'info_dict': { @@ -43,10 +52,18 @@ class ParamountPlusIE(CBSBaseIE): 'timestamp': 1627063200, 'upload_date': '20210723', 'uploader': 'CBSI-NEW', + 'episode_number': 81, + 'thumbnail': r're:https?://.+\.jpg$', + 'season': 'Season 2', + 'chapters': 'count:4', + 'episode': 'Episode 81', + 'season_number': 2, + 'series': 'Tooning Out The News', }, 'params': { 'skip_download': 'm3u8', }, + 'expected_warnings': ['Ignoring subtitle tracks'], }, { 'url': 'https://www.paramountplus.com/movies/video/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC/', 'info_dict': { @@ -54,14 +71,18 @@ class ParamountPlusIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Daddy\'s Home', 'upload_date': 
'20151225', - 'description': 'md5:a0beaf24e8d3b0e81b2ee41d47c06f33', + 'description': 'md5:9a6300c504d5e12000e8707f20c54745', 'uploader': 'CBSI-NEW', 'timestamp': 1451030400, + 'thumbnail': r're:https?://.+\.jpg$', + 'chapters': 'count:0', + 'duration': 5761, + 'series': 'Paramount+ Movies', }, 'params': { 'skip_download': 'm3u8', }, - 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this + 'skip': 'DRM', }, { 'url': 'https://www.paramountplus.com/movies/video/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc/', 'info_dict': { @@ -72,11 +93,15 @@ class ParamountPlusIE(CBSBaseIE): 'timestamp': 1577865600, 'title': 'Sonic the Hedgehog', 'upload_date': '20200101', + 'thumbnail': r're:https?://.+\.jpg$', + 'chapters': 'count:0', + 'duration': 5932, + 'series': 'Paramount+ Movies', }, 'params': { 'skip_download': 'm3u8', }, - 'expected_warnings': ['Ignoring subtitle tracks'], + 'skip': 'DRM', }, { 'url': 'https://www.paramountplus.com/shows/the-real-world/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/the-real-world-reunion/', 'only_matching': True, @@ -99,18 +124,42 @@ def _extract_video_info(self, content_id, mpx_acc=2198311517): asset_types = { item.get('assetType'): { 'format': 'SMIL', - 'formats': 'MPEG4,M3U', + 'formats': 'M3U+none,MPEG4', # '+none' specifies ProtectionScheme (no DRM) } for item in items_data['itemList'] } item = items_data['itemList'][-1] - return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={ + + info, error = {}, None + metadata = { 'title': item.get('title'), 'series': item.get('seriesTitle'), 'season_number': int_or_none(item.get('seasonNum')), 'episode_number': int_or_none(item.get('episodeNum')), 'duration': int_or_none(item.get('duration')), 'thumbnail': url_or_none(item.get('thumbnail')), - }) + } + try: + info = self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info=metadata) + except ExtractorError as e: + error = e + + # Check for DRM formats to give appropriate error + if not info.get('formats'): + for query in asset_types.values(): + query['formats'] = 'MPEG-DASH,M3U,MPEG4' # allows DRM formats + + try: + drm_info = self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info=metadata) + except ExtractorError: + if error: + raise error from None + raise + if drm_info['formats']: + self.report_drm(content_id) + elif error: + raise error + + return info class ParamountPlusSeriesIE(InfoExtractor): From d3a3d7f0cc27ca78aeb807b27c7ebee88ff3161e Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 4 Oct 2022 08:37:48 +1300 Subject: [PATCH 218/284] [extractor/JWPlatform] Fix extractor (#5112) Fix bitrate and filesize extraction and support embeds with unquoted urls. 
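# --- Editorial sketch, not part of the patch: the ParamountPlus change just
# above (PATCH 217/284) is a two-pass probe - request only unprotected formats
# first ('+none' in 'M3U+none,MPEG4' disables DRM ProtectionSchemes), and only
# if nothing comes back re-query with the DRM-capable format list to decide
# between report_drm() and re-raising the original error. The control flow
# reduced to a skeleton (fetch_formats and report_drm stand in for the real
# calls):
def probe_with_drm_fallback(fetch_formats, content_id, report_drm):
    info, error = {}, None
    try:
        info = fetch_formats('M3U+none,MPEG4')
    except Exception as e:
        error = e
    if info.get('formats'):
        return info
    # Nothing playable without DRM: retry allowing protected formats, so a
    # DRM-only title can be told apart from one that is genuinely unavailable.
    drm_info = fetch_formats('MPEG-DASH,M3U,MPEG4')
    if drm_info.get('formats'):
        report_drm(content_id)
    if error:
        raise error
    return info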
Related: #5106 Authored by: coletdjnz --- yt_dlp/extractor/common.py | 3 ++- yt_dlp/extractor/generic.py | 12 ------------ yt_dlp/extractor/jwplatform.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index caec0ccf62..0700b4767b 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3587,7 +3587,8 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } if source_url.startswith('rtmp'): diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 73aefc7829..73422f937c 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1071,18 +1071,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - { - # JWPlatform iframe - 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', - 'info_dict': { - 'id': 'AG26UQXM', - 'ext': 'mp4', - 'upload_date': '20160719', - 'timestamp': 468923808, - 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', - }, - 'add_ie': ['JWPlatform'], - }, { # Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index d6b8420a87..c949689430 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -22,13 +22,42 @@ class JWPlatformIE(InfoExtractor): 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + # JWPlatform iframe + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', + 'info_dict': { + 'id': 'AG26UQXM', + 'ext': 'mp4', + 'upload_date': '20160719', + 'timestamp': 1468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/AG26UQXM/poster.jpg?width=720', + 'description': '', + 'duration': 294.0, + }, + }, { + # Player url not surrounded by quotes + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'info_dict': { + 'id': 'R10NQdhY', + 'title': 'Playgirl', + 'ext': 'mp4', + 'upload_date': '20220624', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', + 'timestamp': 1656064800, + 'description': 'BRD 1966, Will Tremper', + 'duration': 5146.0, + }, + 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }] + @classmethod def _extract_embed_urls(cls, url, webpage): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): # <input value=URL> is used by hyland.com # if we find <iframe>, dont look for <input> ret = re.findall( - r'<%s[^>]+?%s=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), + r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), webpage) if ret: return ret From 7474e4531e5911b04030ee52ff93ca4f2527490d Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 4 Oct 2022 08:40:49 +1300 Subject: [PATCH 219/284] [extractor/AmazonStore] Fix JSON extraction (#5111) Fixes https://github.com/yt-dlp/yt-dlp/issues/5110 Authored by: coletdjnz Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> --- 
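# --- Editorial sketch, not part of the patch: the one-character JWPlatform fix
# above (PATCH 218/284) makes the quote optional (["\']?), so embeds written
# without attribute quotes are still detected. A quick standalone check - the
# first snippet mirrors a conventional quoted embed, the second the unquoted
# style the patch adds support for:
import re

pattern = (r'<(?:script|iframe)[^>]+?src=["\']?'
           r'((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})')
assert re.findall(pattern, '<iframe src="https://cdn.jwplayer.com/players/AG26UQXM-x">')
assert re.findall(pattern, '<iframe src=//cdn.jwplayer.com/players/R10NQdhY-x>')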
yt_dlp/extractor/amazon.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py index 9e9e9772da..4d3170683a 100644 --- a/yt_dlp/extractor/amazon.py +++ b/yt_dlp/extractor/amazon.py @@ -9,7 +9,7 @@ class AmazonStoreIE(InfoExtractor): 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', }, 'playlist_mincount': 1, 'playlist': [{ @@ -18,22 +18,30 @@ class AmazonStoreIE(InfoExtractor): 'ext': 'mp4', 'title': 'mcdodo usb c cable 100W 5a', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 34, }, }] }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + 'title': 'md5:d1d3352428f8f015706c84b31e132169', }, 'playlist_mincount': 4, }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', }, 'playlist-mincount': 1, + }, { + 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', + 'info_dict': { + 'id': 'B08WX337PQ', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + }, + 'playlist_mincount': 1, }] def _real_extract(self, url): @@ -42,7 +50,9 @@ def _real_extract(self, url): for retry in self.RetryManager(): webpage = self._download_webpage(url, id) try: - data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + data_json = self._search_json( + r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, + transform_source=lambda x: x.replace(R'\\u', R'\u')) except ExtractorError as e: retry.error = e @@ -55,4 +65,4 @@ def _real_extract(self, url): 'height': int_or_none(video.get('videoHeight')), 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] - return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) From a057779d5e706f7bb8721a6c46cca47f0925f682 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 4 Oct 2022 01:34:04 +0530 Subject: [PATCH 220/284] [cleanup] Minor fixes Closes #5129, Closes #4982 --- Makefile | 4 ++-- yt_dlp/YoutubeDL.py | 5 +++++ yt_dlp/downloader/common.py | 19 +++++++++++-------- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/spotify.py | 1 + yt_dlp/extractor/youtube.py | 7 ++++--- 6 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 19a377002b..3b97c74079 100644 --- a/Makefile +++ b/Makefile @@ -81,9 +81,9 @@ yt-dlp: yt_dlp/*.py yt_dlp/*/*.py mkdir -p zip/$$d ;\ cp -pPR $$d/*.py zip/$$d/ ;\ done - touch -t 200001010101 zip/yt_dlp/*.py zip/yt_dlp/*/*.py zip/yt_dlp/*/*/*.py + touch -t 200001010101 zip/yt_dlp/*.py zip/yt_dlp/*/*.py mv zip/yt_dlp/__main__.py zip/ - cd zip ; zip -q ../yt-dlp yt_dlp/*.py yt_dlp/*/*.py yt_dlp/*/*/*.py __main__.py + cd zip ; zip -q ../yt-dlp yt_dlp/*.py yt_dlp/*/*.py __main__.py rm -rf zip echo '#!$(PYTHON)' > yt-dlp cat yt-dlp.zip >> yt-dlp diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index bc6de49267..53681149e1 100644 --- a/yt_dlp/YoutubeDL.py +++ 
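# --- Editorial sketch, not part of the patch: the AmazonStore fix above
# (PATCH 219/284) deals with JSON embedded inside jQuery.parseJSON('...'),
# where unicode escapes arrive double-escaped (\\u2013) and would survive
# json.loads as literal text. The transform_source added above collapses them
# first; the same idea in isolation (the sample payload is made up):
import json

raw = R'{"title": "Mcdodo USB-C cable \\u2013 100W"}'
fixed = raw.replace(R'\\u', R'\u')  # the patch's transform_source
assert json.loads(fixed)['title'] == 'Mcdodo USB-C cable \u2013 100W'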
b/yt_dlp/YoutubeDL.py @@ -2426,6 +2426,8 @@ def _fill_common_fields(self, info_dict, is_video=True): for key in live_keys: if info_dict.get(key) is None: info_dict[key] = (live_status == key) + if live_status == 'post_live': + info_dict['was_live'] = True # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. @@ -3683,6 +3685,8 @@ def print_debug_header(self): if not self.params.get('verbose'): return + from . import _IN_CLI # Must be delayed import + # These imports can be slow. So import them only as needed from .extractor.extractors import _LAZY_LOADER from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors @@ -3719,6 +3723,7 @@ def get_encoding(stream): __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', + '' if _IN_CLI else 'API', delim=' ')) if not _LAZY_LOADER: if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index ab557a47ac..221b3827c7 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -24,6 +24,7 @@ encodeFilename, format_bytes, join_nonempty, + remove_start, sanitize_open, shell_quote, timeconvert, @@ -120,11 +121,11 @@ def format_seconds(seconds): time = timetuple_from_msec(seconds * 1000) if time.hours > 99: return '--:--:--' - if not time.hours: - return ' %02d:%02d' % time[1:-1] return '%02d:%02d:%02d' % time[:-1] - format_eta = format_seconds + @classmethod + def format_eta(cls, seconds): + return f'{remove_start(cls.format_seconds(seconds), "00:"):>8s}' @staticmethod def calc_percent(byte_counter, data_len): @@ -332,6 +333,8 @@ def with_fields(*tups, default=''): return tmpl return default + _formats_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' + if s['status'] == 'finished': if self.params.get('noprogress'): self.to_screen('[download] Download completed') @@ -339,7 +342,7 @@ def with_fields(*tups, default=''): s.update({ 'speed': speed, '_speed_str': self.format_speed(speed).strip(), - '_total_bytes_str': format_bytes(s.get('total_bytes')), + '_total_bytes_str': _formats_bytes('total_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), '_percent_str': self.format_percent(100), }) @@ -354,15 +357,15 @@ def with_fields(*tups, default=''): return s.update({ - '_eta_str': self.format_eta(s.get('eta')), + '_eta_str': self.format_eta(s.get('eta')).strip(), '_speed_str': self.format_speed(s.get('speed')), '_percent_str': self.format_percent(try_call( lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], lambda: s['downloaded_bytes'] == 0 and 0)), - '_total_bytes_str': format_bytes(s.get('total_bytes')), - '_total_bytes_estimate_str': format_bytes(s.get('total_bytes_estimate')), - '_downloaded_bytes_str': format_bytes(s.get('downloaded_bytes')), + '_total_bytes_str': _formats_bytes('total_bytes'), + '_total_bytes_estimate_str': _formats_bytes('total_bytes_estimate'), + '_downloaded_bytes_str': _formats_bytes('downloaded_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), }) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 0700b4767b..944b196a11 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1862,7 +1862,7 @@ def add_item(field, reverse, closest, limit_text): alias, field = field, self._get_field_setting(field, 'field') if self._get_field_setting(alias, 
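# --- Editorial sketch, not part of the patch: the downloader cleanup above
# (PATCH 220/284) drops format_seconds()'s short form and instead has
# format_eta() strip a leading zero-hours field while right-aligning to a
# stable eight-character column. The effect, rebuilt around yt-dlp's real
# remove_start helper:
from yt_dlp.utils import remove_start

def format_eta(seconds):
    hours, rem = divmod(int(seconds), 3600)
    clock = '%02d:%02d:%02d' % (hours, *divmod(rem, 60))
    return f'{remove_start(clock, "00:"):>8s}'

assert format_eta(205) == '   03:25'
assert format_eta(7384) == '02:03:04'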
'deprecated'): self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may ' - 'be removed in a future version. Please use {field} instead') + f'be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py index 4da24db9e9..55ce36aeaa 100644 --- a/yt_dlp/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py @@ -16,6 +16,7 @@ class SpotifyBaseIE(InfoExtractor): + _WORKING = False _ACCESS_TOKEN = None _OPERATION_HASHES = { 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index f73465ba4c..6047f2864a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -390,6 +390,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko' ] + _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + @functools.cached_property def _preferred_lang(self): """ @@ -692,12 +694,11 @@ def _extract_alerts(cls, data): yield alert_type, message def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): - errors = [] - warnings = [] + errors, warnings = [], [] for alert_type, alert_message in alerts: if alert_type.lower() == 'error' and fatal: errors.append([alert_type, alert_message]) - else: + elif alert_message not in self._IGNORED_WARNINGS: warnings.append([alert_type, alert_message]) for alert_type, alert_message in (warnings + errors[:-1]): From 1d77d8ce07d21850cac2be6fcffea3311234bc16 Mon Sep 17 00:00:00 2001 From: Livia Medeiros <livia@cirno.name> Date: Tue, 4 Oct 2022 06:01:53 +0900 Subject: [PATCH 221/284] [extractor/holodex] Fix `_VALID_URL` (#4948) Authored by: LiviaMedeiros --- yt_dlp/extractor/holodex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/holodex.py b/yt_dlp/extractor/holodex.py index 70d711719e..a2b73ecc1c 100644 --- a/yt_dlp/extractor/holodex.py +++ b/yt_dlp/extractor/holodex.py @@ -6,7 +6,7 @@ class HolodexIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.|staging\.)?holodex\.net/(?: api/v2/playlist/(?P<playlist>\d+)| - watch/(?P<id>\w+)(?:\?(?:[^#]+&)?playlist=(?P<playlist2>\d+))? + watch/(?P<id>[\w-]{11})(?:\?(?:[^#]+&)?playlist=(?P<playlist2>\d+))? 
)''' _TESTS = [{ 'url': 'https://holodex.net/watch/9kQ2GtvDV3s', From dd4411aac2ef72edb170efb38d19b13b82271cc4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 3 Oct 2022 21:04:39 +0000 Subject: [PATCH 222/284] [extractor/nfl] Fix extractor (#5130) Closes #1708 Authored by: bashonly --- yt_dlp/extractor/nfl.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index e5810b3464..1065666117 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -53,8 +53,7 @@ class NFLBaseIE(InfoExtractor): ) )/ ''' - _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' - _WORKING = False + _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+});?\s*</script>' def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) @@ -66,7 +65,7 @@ def _parse_video_config(self, video_config, display_id): 'Anvato', mcp_id) else: media_id = item.get('id') or item['entityId'] - title = item['title'] + title = item.get('title') item_url = item['url'] info = {'id': media_id} ext = determine_ext(item_url) @@ -108,6 +107,9 @@ class NFLIE(NFLBaseIE): 'timestamp': 1608009755, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'NFL', + 'tags': 'count:6', + 'duration': 157, + 'categories': 'count:3', } }, { 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', @@ -117,7 +119,8 @@ class NFLIE(NFLBaseIE): 'ext': 'mp3', 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', 'description': 'md5:12ada8ee70e6762658c30e223e095075', - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, From 4d37720a0c5f1c9c4768ea20b0f943277f55bc12 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Tue, 4 Oct 2022 11:48:31 +0900 Subject: [PATCH 223/284] [extractor/youtube] Download `post_live` videos from start (#5091) * The fragments are generated as a `LazyList`. So only the required formats are expanded during download, but all fragment lists are printed/written in infojson. * The m3u8 formats which cannot be downloaded from start are not extracted by default, but can be enabled with an extractor-arg. The extractor-arg `include_live_dash` is renamed to `include_incomplete_formats` to account for this new use-case. Closes #1564 Authored by: Lesmiscore, pukkandan --- README.md | 2 +- yt_dlp/extractor/youtube.py | 159 ++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 76c73398e3..8f93ba415a 100644 --- a/README.md +++ b/README.md @@ -1704,7 +1704,7 @@ #### youtube * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. 
`1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) +* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6047f2864a..4456110f6c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -24,6 +24,7 @@ from ..utils import ( NO_DEFAULT, ExtractorError, + LazyList, UserNotLive, bug_reports_message, classproperty, @@ -2493,10 +2494,8 @@ def __init__(self, *args, **kwargs): self._code_cache = {} self._player_cache = {} - def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): + def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() - - is_live = True start_time = time.time() formats = [f for f in formats if f.get('is_from_start')] @@ -2511,7 +2510,8 @@ def refetch_manifest(format_id, delay): microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + is_live = live_status == 'is_live' start_time = time.time() def mpd_feed(format_id, delay): @@ -2532,12 +2532,17 @@ def mpd_feed(format_id, delay): return f['manifest_url'], f['manifest_stream_number'], is_live for f in formats: - f['is_live'] = True - f['protocol'] = 'http_dash_segments_generator' - f['fragments'] = functools.partial( - self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) + f['is_live'] = is_live + gen = functools.partial(self._live_dash_fragments, video_id, f['format_id'], + live_start_time, mpd_feed, not is_live and f.copy()) + if is_live: + f['fragments'] = gen + f['protocol'] = 'http_dash_segments_generator' + else: + f['fragments'] = LazyList(gen({})) + del f['is_from_start'] - def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): + def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, manifestless_orig_fmt, ctx): FETCH_SPAN, MAX_DURATION = 5, 432000 mpd_url, stream_number, is_live = None, None, True @@ -2568,15 +2573,18 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): return False, last_seq elif old_mpd_url == mpd_url: return True, last_seq - try: - fmts, _ = self._extract_mpd_formats_and_subtitles( - mpd_url, None, note=False, errnote=False, fatal=False) - except ExtractorError: - fmts = None - if not fmts: - no_fragment_score += 2 - return False, last_seq - fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) + if manifestless_orig_fmt: + fmt_info = manifestless_orig_fmt + else: + try: + fmts, _ = self._extract_mpd_formats_and_subtitles( + mpd_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + fmts = None + if not fmts: + no_fragment_score += 2 + return False, last_seq + fmt_info = next(x for x in fmts if 
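# --- Editorial sketch, not part of the patch: wrapping the fragment generator
# in LazyList above means post-live fragments are only materialised for the
# formats that are actually downloaded, yet --write-info-json can still
# serialise the full list. A minimal illustration with yt-dlp's real LazyList:
from yt_dlp.utils import LazyList

def fragment_gen():
    for seq in range(3):
        print(f'generating fragment {seq}')  # side effect to expose laziness
        yield {'path': f'sq/{seq}'}

fragments = LazyList(fragment_gen())  # nothing generated yet
first = fragments[0]                  # pulls only up to index 0
everything = list(fragments)          # exhausts once; results stay cached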
x['manifest_stream_number'] == stream_number) fragments = fmt_info['fragments'] fragment_base_url = fmt_info['fragment_base_url'] assert fragment_base_url @@ -2584,6 +2592,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) return True, _last_seq + self.write_debug(f'[{video_id}] Generating fragments for format {format_id}') while is_live: fetch_time = time.time() if no_fragment_score > 30: @@ -2637,6 +2646,11 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): except ExtractorError: continue + if manifestless_orig_fmt: + # Stop at the first iteration if running for post-live manifestless; + # fragment count no longer increase since it starts + break + time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) def _extract_player_url(self, *ytcfgs, webpage=None): @@ -3397,7 +3411,12 @@ def append_client(*client_names): self.report_warning(last_error) return prs, player_url - def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration): + def _needs_live_processing(self, live_status, duration): + if (live_status == 'is_live' and self.get_param('live_from_start') + or live_status == 'post_live' and (duration or 0) > 4 * 3600): + return live_status + + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): itags, stream_ids = {}, [] itag_qualities, res_qualities = {}, {0: None} q = qualities([ @@ -3544,15 +3563,22 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, i dct['container'] = dct['ext'] + '_dash' yield dct - live_from_start = is_live and self.get_param('live_from_start') - skip_manifests = self._configuration_arg('skip') - if not self.get_param('youtube_include_hls_manifest', True): - skip_manifests.append('hls') + needs_live_processing = self._needs_live_processing(live_status, duration) + skip_bad_formats = not self._configuration_arg('include_incomplete_formats') + + skip_manifests = set(self._configuration_arg('skip')) + if (not self.get_param('youtube_include_hls_manifest', True) + or needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway + or needs_live_processing and skip_bad_formats): + skip_manifests.add('hls') + if not self.get_param('youtube_include_dash_manifest', True): - skip_manifests.append('dash') - get_dash = 'dash' not in skip_manifests and ( - not is_live or live_from_start or self._configuration_arg('include_live_dash')) - get_hls = not live_from_start and 'hls' not in skip_manifests + skip_manifests.add('dash') + if self._configuration_arg('include_live_dash'): + self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. 
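# --- Editorial sketch, not part of the patch: the refresh loop above tracks a
# live stream's progress by parsing the sequence number out of the newest
# fragment path ('.../sq/<n>/...') and checking whether it still advances.
# That parsing step in isolation, against a made-up fragment list:
import re

def last_sequence(fragments):
    # Same pattern the patch applies to fragments[-1]['path'].
    return int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))

fragments = [{'path': f'sq/{n}/dur/5000'} for n in (118, 119, 120)]
assert last_sequence(fragments) == 120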
' + 'Use include_incomplete_formats extractor argument instead') + elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': + skip_manifests.add('dash') def process_manifest_format(f, proto, itag): if itag in itags: @@ -3570,16 +3596,17 @@ def process_manifest_format(f, proto, itag): subtitles = {} for sd in streaming_data: - hls_manifest_url = get_hls and sd.get('hlsManifestUrl') + hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: - fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', self._search_regex( r'/itag/(\d+)', f['url'], 'itag', default=None)): yield f - dash_manifest_url = get_dash and sd.get('dashManifestUrl') + dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH @@ -3587,7 +3614,7 @@ def process_manifest_format(f, proto, itag): if process_manifest_format(f, 'dash', f['format_id']): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) - if live_from_start: + if needs_live_processing: f['is_from_start'] = True yield f @@ -3653,11 +3680,23 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, is_live = get_first(video_details, 'isLive') if is_live is None: is_live = get_first(live_broadcast_details, 'isLiveNow') + live_content = get_first(video_details, 'isLiveContent') + is_upcoming = get_first(video_details, 'isUpcoming') + if is_live is None and is_upcoming or live_content is False: + is_live = False + if is_upcoming is None and (live_content or is_live): + is_upcoming = False + post_live = get_first(video_details, 'isPostLiveDvr') + live_status = ('post_live' if post_live + else 'is_live' if is_live + else 'is_upcoming' if is_upcoming + else None if None in (is_live, is_upcoming, live_content) + else 'was_live' if live_content else 'not_live') streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) - *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration) + *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) - return live_broadcast_details, is_live, streaming_data, formats, subtitles + return live_broadcast_details, live_status, streaming_data, formats, subtitles def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -3749,8 +3788,10 @@ def feed_entry(name): or get_first(microformats, 'lengthSeconds') or parse_duration(search_meta('duration'))) or None - live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \ - self._list_formats(video_id, microformats, video_details, player_responses, player_url) + live_broadcast_details, live_status, streaming_data, formats, automatic_captions = \ + self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration) + if live_status == 'post_live': + self.write_debug(f'{video_id}: Video is in Post-Live 
Manifestless mode') if not formats: if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): @@ -3809,7 +3850,7 @@ def feed_entry(name): thumbnails.extend({ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( video_id=video_id, name=name, ext=ext, - webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''), + webp='_webp' if ext == 'webp' else '', live='_live' if live_status == 'is_live' else ''), } for name in thumbnail_names for ext in ('webp', 'jpg')) for thumb in thumbnails: i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) @@ -3824,20 +3865,27 @@ def feed_entry(name): or search_meta('channelId')) owner_profile_url = get_first(microformats, 'ownerProfileUrl') - live_content = get_first(video_details, 'isLiveContent') - is_upcoming = get_first(video_details, 'isUpcoming') - if is_live is None: - if is_upcoming or live_content is False: - is_live = False - if is_upcoming is None and (live_content or is_live): - is_upcoming = False live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) if not duration and live_end_time and live_start_time: duration = live_end_time - live_start_time - if is_live and self.get_param('live_from_start'): - self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data) + needs_live_processing = self._needs_live_processing(live_status, duration) + + def is_bad_format(fmt): + if needs_live_processing and not fmt.get('is_from_start'): + return True + elif (live_status == 'is_live' and needs_live_processing != 'is_live' + and fmt.get('protocol') == 'http_dash_segments'): + return True + + for fmt in filter(is_bad_format, formats): + fmt['preference'] = (fmt.get('preference') or -1) - 10 + fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ') + + if needs_live_processing: + self._prepare_live_from_start_formats( + formats, video_id, live_start_time, url, webpage_url, smuggled_data, live_status == 'is_live') formats.extend(self._extract_storyboard(player_responses, duration)) @@ -3872,22 +3920,10 @@ def feed_entry(name): 'categories': [category] if category else None, 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), - 'is_live': is_live, - 'was_live': (False if is_live or is_upcoming or live_content is False - else None if is_live is None or is_upcoming is None - else live_content), - 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL + 'live_status': live_status, 'release_timestamp': live_start_time, } - if get_first(video_details, 'isPostLiveDvr'): - self.write_debug('Video is in Post-Live Manifestless mode') - info['live_status'] = 'post_live' - if (duration or 0) > 4 * 3600: - self.report_warning( - 'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. 
' - 'This is a known issue and patches are welcome') - subtitles = {} pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) if pctr: @@ -4017,7 +4053,8 @@ def process_language(container, base_url, lang_code, sub_name, query): 'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1', 'video_id': video_id, 'ext': 'json', - 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', + 'protocol': ('youtube_live_chat' if live_status in ('is_live', 'is_upcoming') + else 'youtube_live_chat_replay'), }] if initial_data: @@ -4124,9 +4161,7 @@ def process_language(container, base_url, lang_code, sub_name, query): unified_strdate(get_first(microformats, 'uploadDate')) or unified_strdate(search_meta('uploadDate'))) if not upload_date or ( - not info.get('is_live') - and not info.get('was_live') - and info.get('live_status') != 'is_upcoming' + live_status in ('not_live', None) and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) ): upload_date = strftime_or_none( From 0d887f273a0aa28e7aea3780663b7faca44440b6 Mon Sep 17 00:00:00 2001 From: Bobscorn <qwertster0@gmail.com> Date: Tue, 4 Oct 2022 15:51:54 +1300 Subject: [PATCH 224/284] [extractor/IsraelNationalNews] Add extractor (#5089) Closes #4019 Authored by: Bobscorn --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/israelnationalnews.py | 50 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 yt_dlp/extractor/israelnationalnews.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3a92c1d028..42f7658192 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -755,6 +755,7 @@ IslamChannelIE, IslamChannelSeriesIE, ) +from .israelnationalnews import IsraelNationalNewsIE from .itprotv import ( ITProTVIE, ITProTVCourseIE diff --git a/yt_dlp/extractor/israelnationalnews.py b/yt_dlp/extractor/israelnationalnews.py new file mode 100644 index 0000000000..35040f576a --- /dev/null +++ b/yt_dlp/extractor/israelnationalnews.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class IsraelNationalNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?israelnationalnews\.com/news/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.israelnationalnews.com/news/354520', + 'info_dict': { + 'id': '354520' + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jA84wQhVvg8', + 'title': 'Even CNN Host Is Shocked by How Bad Biden\'s Approval Ratings Have Gotten | DM CLIPS | Rubin Report', + 'ext': 'mp4', + 'description': 'md5:b7325a3d00c7596337dc3ae37e32d35c', + 'channel': 'The Rubin Report', + 'channel_follower_count': int, + 'comment_count': int, + 'categories': ['News & Politics'], + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/RubinReport', + 'uploader_id': 'RubinReport', + 'availability': 'public', + 'view_count': int, + 'duration': 240, + 'thumbnail': 'https://i.ytimg.com/vi_webp/jA84wQhVvg8/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'age_limit': 0, + 'tags': 'count:29', + 'channel_id': 'UCJdKr0Bgd_5saZYqLCa9mng', + 'channel_url': 'https://www.youtube.com/channel/UCJdKr0Bgd_5saZYqLCa9mng', + 'upload_date': '20220606', + 'uploader': 'The Rubin Report', + } + }] + }] + + def _real_extract(self, url): + news_article_id = self._match_id(url) + article_json = self._download_json( + 
f'https://www.israelnationalnews.com/Generic/NewAPI/Item?type=0&Item={news_article_id}', news_article_id) + + urls = traverse_obj(article_json, ('Content2', ..., 'content', ..., 'attrs', 'src')) + if not urls: + raise ExtractorError('This article does not have any videos', expected=True) + + return self.playlist_from_matches(urls, news_article_id, ie='Youtube') From 12f153a8275bd4c05aee1532b3eb00f1361c4636 Mon Sep 17 00:00:00 2001 From: Locke <hamannsun@gmail.com> Date: Tue, 4 Oct 2022 10:59:05 +0800 Subject: [PATCH 225/284] [extractor/BilibiliSpace] Fix extractor, better error message (#5043) Closes #5038 Authored by: lockmatrix --- yt_dlp/extractor/bilibili.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 2e03aee856..5a5c79f296 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -4,6 +4,7 @@ import functools import math import re +import urllib from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -508,11 +509,11 @@ def _real_extract(self, url): class BilibiliSpaceBaseIE(InfoExtractor): def _extract_playlist(self, fetch_page, get_metadata, get_entries): - first_page = fetch_page(1) + first_page = fetch_page(0) metadata = get_metadata(first_page) paged_list = InAdvancePagedList( - lambda idx: get_entries(fetch_page(idx) if idx > 1 else first_page), + lambda idx: get_entries(fetch_page(idx) if idx else first_page), metadata['page_count'], metadata['page_size']) return metadata, paged_list @@ -535,10 +536,19 @@ def _real_extract(self, url): 'To download audios, add a "/audio" to the URL') def fetch_page(page_idx): - return self._download_json( - 'https://api.bilibili.com/x/space/arc/search', playlist_id, - note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx, 'jsonp': 'jsonp'})['data'] + try: + response = self._download_json('https://api.bilibili.com/x/space/arc/search', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + raise ExtractorError( + 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) + raise + if response['code'] == -401: + raise ExtractorError( + 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) + return response['data'] def get_metadata(page_data): page_size = page_data['page']['ps'] @@ -573,7 +583,7 @@ def fetch_page(page_idx): return self._download_json( 'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id, note=f'Downloading page {page_idx}', - query={'uid': playlist_id, 'pn': page_idx, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data'] + query={'uid': playlist_id, 'pn': page_idx + 1, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data'] def get_metadata(page_data): return { @@ -608,7 +618,7 @@ def fetch_page(page_idx): return self._download_json( 'https://api.bilibili.com/x/polymer/space/seasons_archives_list', playlist_id, note=f'Downloading page {page_idx}', - query={'mid': mid, 'season_id': sid, 'page_num': page_idx, 'page_size': 30})['data'] + query={'mid': mid, 'season_id': sid, 'page_num': page_idx + 1, 'page_size': 30})['data'] def get_metadata(page_data): page_size = page_data['page']['page_size'] From c7f540ea1eab69c47ba2a758f9c79297b721cb70 Mon Sep 17 00:00:00 2001 From: HobbyistDev 
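# --- Editorial sketch, not part of the patch: the IsraelNationalNews extractor
# above gathers every embed URL in one pass with traverse_obj, where Ellipsis
# (...) in the path branches over all items at that level and misses are
# silently dropped. The same call against a hand-written stand-in for the
# article API response:
from yt_dlp.utils import traverse_obj

article_json = {'Content2': [{'content': [
    {'attrs': {'src': 'https://www.youtube.com/watch?v=jA84wQhVvg8'}},
    {'attrs': {'alt': 'image, no src key'}},
]}]}
urls = traverse_obj(article_json, ('Content2', ..., 'content', ..., 'attrs', 'src'))
assert urls == ['https://www.youtube.com/watch?v=jA84wQhVvg8']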
<105957301+HobbyistDev@users.noreply.github.com> Date: Tue, 4 Oct 2022 12:09:23 +0900 Subject: [PATCH 226/284] [extractor/detik] Generalize extractors (#4899) Authored by: HobbyistDev, coletdjnz --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/cnn.py | 57 ++++++++- yt_dlp/extractor/detik.py | 216 +++++++++++++++++++------------- 3 files changed, 186 insertions(+), 90 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42f7658192..8e9cfd8fb2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -333,6 +333,7 @@ CNNIE, CNNBlogsIE, CNNArticleIE, + CNNIndonesiaIE, ) from .coub import CoubIE from .comedycentral import ( @@ -411,7 +412,7 @@ DeezerAlbumIE, ) from .democracynow import DemocracynowIE -from .detik import Detik20IE +from .detik import DetikEmbedIE from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py index 96482eaf58..61b62fae9f 100644 --- a/yt_dlp/extractor/cnn.py +++ b/yt_dlp/extractor/cnn.py @@ -1,6 +1,6 @@ from .common import InfoExtractor from .turner import TurnerBaseIE -from ..utils import url_basename +from ..utils import merge_dicts, try_call, url_basename class CNNIE(TurnerBaseIE): @@ -141,3 +141,58 @@ def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) + + +class CNNIndonesiaIE(InfoExtractor): + _VALID_URL = r'https?://www\.cnnindonesia\.com/[\w-]+/(?P<upload_date>\d{8})\d+-\d+-(?P<id>\d+)/(?P<display_id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.cnnindonesia.com/ekonomi/20220909212635-89-845885/alasan-harga-bbm-di-indonesia-masih-disubsidi', + 'info_dict': { + 'id': '845885', + 'ext': 'mp4', + 'description': 'md5:e7954bfa6f1749bc9ef0c079a719c347', + 'upload_date': '20220909', + 'title': 'Alasan Harga BBM di Indonesia Masih Disubsidi', + 'timestamp': 1662859088, + 'duration': 120.0, + 'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/09/thumbnail-ekopedia-alasan-harga-bbm-disubsidi_169\.jpeg', + 'tags': ['ekopedia', 'subsidi bbm', 'subsidi', 'bbm', 'bbm subsidi', 'harga pertalite naik'], + 'age_limit': 0, + 'release_timestamp': 1662859088, + 'release_date': '20220911', + 'uploader': 'Asfahan Yahsyi', + } + }, { + 'url': 'https://www.cnnindonesia.com/internasional/20220911104341-139-846189/video-momen-charles-disambut-meriah-usai-dilantik-jadi-raja-inggris', + 'info_dict': { + 'id': '846189', + 'ext': 'mp4', + 'upload_date': '20220911', + 'duration': 76.0, + 'timestamp': 1662869995, + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169\.jpeg', + 'title': 'VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'tags': ['raja charles', 'raja charles iii', 'ratu elizabeth', 'ratu elizabeth meninggal dunia', 'raja inggris', 'inggris'], + 'age_limit': 0, + 'release_date': '20220911', + 'uploader': 'REUTERS', + 'release_timestamp': 1662869995, + } + }] + + def _real_extract(self, url): + upload_date, video_id, display_id = self._match_valid_url(url).group('upload_date', 'id', 'display_id') + webpage = self._download_webpage(url, display_id) + + json_ld_list = list(self._yield_json_ld(webpage, display_id)) + json_ld_data = self._json_ld(json_ld_list, display_id) + embed_url = next( + 
json_ld.get('embedUrl') for json_ld in json_ld_list if json_ld.get('@type') == 'VideoObject') + + return merge_dicts(json_ld_data, { + '_type': 'url_transparent', + 'url': embed_url, + 'upload_date': upload_date, + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + }) diff --git a/yt_dlp/extractor/detik.py b/yt_dlp/extractor/detik.py index e2637d3f3f..7ee6f2746a 100644 --- a/yt_dlp/extractor/detik.py +++ b/yt_dlp/extractor/detik.py @@ -1,122 +1,162 @@ from .common import InfoExtractor -from ..utils import merge_dicts, str_or_none +from ..utils import int_or_none, merge_dicts, try_call, url_basename -class Detik20IE(InfoExtractor): - IE_NAME = '20.detik.com' - _VALID_URL = r'https?://20\.detik\.com/((?!program)[\w-]+)/[\d-]+/(?P<id>[\w-]+)' - _TESTS = [{ - # detikflash - 'url': 'https://20.detik.com/detikflash/20220705-220705098/zulhas-klaim-sukses-turunkan-harga-migor-jawa-bali', +class DetikEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + # cnn embed + 'url': 'https://www.cnnindonesia.com/embed/video/846189', 'info_dict': { - 'id': '220705098', + 'id': '846189', 'ext': 'mp4', - 'duration': 157, - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/05/bfe0384db04f4bbb9dd5efc869c5d4b1-20220705164334-0s.jpg?w=650&q=80', - 'description': 'md5:ac18dcee5b107abbec1ed46e0bf400e3', - 'title': 'Zulhas Klaim Sukses Turunkan Harga Migor Jawa-Bali', - 'tags': ['zulkifli hasan', 'menteri perdagangan', 'minyak goreng'], - 'timestamp': 1657039548, - 'upload_date': '20220705' + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https?://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169.jpeg', + 'title': 'Video CNN Indonesia - VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'age_limit': 0, + 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'], + 'release_timestamp': 1662869995, + 'release_date': '20220911', + 'uploader': 'REUTERS' } }, { - # e-flash - 'url': 'https://20.detik.com/e-flash/20220705-220705109/ahli-level-ppkm-jadi-payung-strategi-protokol-kesehatan', - 'info_dict': { - 'id': '220705109', - 'ext': 'mp4', - 'tags': ['ppkm jabodetabek', 'dicky budiman', 'ppkm'], - 'upload_date': '20220705', - 'duration': 110, - 'title': 'Ahli: Level PPKM Jadi Payung Strategi Protokol Kesehatan', - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/05/Ahli-_Level_PPKM_Jadi_Payung_Strat_jOgUMCN-20220705182313-custom.jpg?w=650&q=80', - 'description': 'md5:4eb825a9842e6bdfefd66f47b364314a', - 'timestamp': 1657045255, - } - }, { - # otobuzz + # 20.detik 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', 'info_dict': { + 'display_id': 'mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', 'id': '220704093', 'ext': 'mp4', - 'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'], - 'timestamp': 1656951521, - 'duration': 83, - 'upload_date': '20220704', - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg?w=650&q=80', 'description': 'md5:9b2257341b6f375cdcf90106146d5ffb', + 'thumbnail': r're:https?://cdnv\.detik\.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg', 'title': 'Mulai Rp 10 Jutaan! 
Ini Skema Kredit Mitsubishi Pajero Sport', - } - }, { - # sport-buzz - 'url': 'https://20.detik.com/sport-buzz/20220704-220704054/crash-crash-horor-di-paruh-pertama-motogp-2022', - 'info_dict': { - 'id': '220704054', - 'ext': 'mp4', - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/04/6b172c6fb564411996ea145128315630-20220704090746-0s.jpg?w=650&q=80', - 'title': 'Crash-crash Horor di Paruh Pertama MotoGP 2022', - 'description': 'md5:fbcc6687572ad7d16eb521b76daa50e4', - 'timestamp': 1656925591, - 'duration': 107, - 'tags': ['marc marquez', 'fabio quartararo', 'francesco bagnaia', 'motogp crash', 'motogp 2022'], + 'timestamp': 1656951521, 'upload_date': '20220704', + 'duration': 83.0, + 'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'], + 'release_timestamp': 1656926321, + 'release_date': '20220704', + 'age_limit': 0, + 'uploader': 'Ridwan Arifin ' # TODO: strip trailling whitespace at uploader } }, { - # adu-perspektif - 'url': 'https://20.detik.com/adu-perspektif/20220518-220518144/24-tahun-reformasi-dan-alarm-demokrasi-dari-filipina', + # pasangmata.detik + 'url': 'https://pasangmata.detik.com/contribution/366649', 'info_dict': { - 'id': '220518144', + 'id': '366649', 'ext': 'mp4', - 'title': '24 Tahun Reformasi dan Alarm Demokrasi dari Filipina', - 'upload_date': '20220518', - 'timestamp': 1652913823, - 'duration': 185.0, - 'tags': ['politik', 'adu perspektif', 'indonesia', 'filipina', 'demokrasi'], - 'description': 'md5:8eaaf440b839c3d02dca8c9bbbb099a9', - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/05/18/adpers_18_mei_compressed-20220518230458-custom.jpg?w=650&q=80', + 'title': 'Saling Dorong Aparat dan Pendemo di Aksi Tolak Kenaikan BBM', + 'description': 'md5:7a6580876c8381c454679e028620bea7', + 'age_limit': 0, + 'tags': 'count:17', + 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg', } }, { - # sosok - 'url': 'https://20.detik.com/sosok/20220702-220703032/resa-boenard-si-princess-bantar-gebang', + # insertlive embed + 'url': 'https://www.insertlive.com/embed/video/290482', 'info_dict': { - 'id': '220703032', + 'id': '290482', 'ext': 'mp4', - 'timestamp': 1656824438, - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/02/SOSOK_BGBJ-20220702191138-custom.jpg?w=650&q=80', - 'title': 'Resa Boenard Si \'Princess Bantar Gebang\'', - 'description': 'md5:84ea66306a0285330de6a13fc6218b78', - 'tags': ['sosok', 'sosok20d', 'bantar gebang', 'bgbj', 'resa boenard', 'bantar gebang bgbj', 'bgbj bantar gebang', 'sosok bantar gebang', 'sosok bgbj', 'bgbj resa boenard'], - 'upload_date': '20220703', - 'duration': 650, + 'release_timestamp': 1663063704, + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/13/leonardo-dicaprio_169.png?w=600&q=90', + 'age_limit': 0, + 'description': 'Aktor Leonardo DiCaprio memang baru saja putus dari kekasihnya yang bernama Camilla Morrone.', + 'release_date': '20220913', + 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta', + 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'], + 'uploader': '!nsertlive', } }, { - # viral - 'url': 'https://20.detik.com/viral/20220603-220603135/merasakan-bus-imut-tanpa-pengemudi-muter-muter-di-kawasan-bsd-city', + # beautynesia embed + 'url': 'https://www.beautynesia.id/embed/video/261636', 'info_dict': { - 'id': '220603135', + 'id': '261636', 'ext': 'mp4', - 'description': 'md5:4771fe101aa303edb829c59c26f9e7c6', - 'timestamp': 
1654304305, - 'title': 'Merasakan Bus Imut Tanpa Pengemudi, Muter-muter di Kawasan BSD City', - 'tags': ['viral', 'autonomous vehicle', 'electric', 'shuttle bus'], - 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/06/03/VIRAL_BUS_NO_SUPIR-20220604004707-custom.jpg?w=650&q=80', - 'duration': 593, - 'upload_date': '20220604', + 'age_limit': 0, + 'release_timestamp': 1662375600, + 'description': 'Menurut ramalan astrologi, tiga zodiak ini bakal hoki sepanjang September 2022.', + 'title': '3 Zodiak Paling Beruntung Selama September 2022', + 'release_date': '20220905', + 'tags': ['zodiac update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'], + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90', + 'uploader': 'amh', + } + }, { + # cnbcindonesia embed + 'url': 'https://www.cnbcindonesia.com/embed/video/371839', + 'info_dict': { + 'id': '371839', + 'ext': 'mp4', + 'title': 'Puluhan Pejabat Rusia Tuntut Putin Mundur', + 'tags': ['putin'], + 'age_limit': 0, + 'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80', + 'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc', + } + }, { + # detik shortlink (we can get it from https://dtk.id/?<url>) + 'url': 'https://dtk.id/NkISKr', + 'info_dict': { + 'id': '220914049', + 'ext': 'mp4', + 'release_timestamp': 1663114488, + 'uploader': 'Tim 20Detik', + 'title': 'Pakar Bicara soal Tim Khusus Jokowi dan Mereka yang Pro ke Bjorka', + 'age_limit': 0, + 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/09/14/f15cae71d7b640c58e75b254ecbb1ce1-20220914071613-0s.jpg?w=400&q=80', + 'display_id': 'pakar-bicara-soal-tim-khusus-jokowi-dan-mereka-yang-pro-ke-bjorka', + 'upload_date': '20220914', + 'release_date': '20220914', + 'description': 'md5:5eb03225f7ee40207dd3a1e18a73f1ff', + 'timestamp': 1663139688, + 'duration': 213.0, + 'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'], } }] - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - json_ld_data = self._search_json_ld(webpage, display_id) + def _extract_from_webpage(self, url, webpage): + display_id = url_basename(url) + player_type, video_data = self._search_regex( + r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})', + webpage, 'playerjs', group=('type', 'video_data'), default=(None, '')) - video_url = self._html_search_regex( - r'videoUrl\s*:\s*"(?P<video_url>[^"]+)', webpage, 'videoUrl') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id, ext='mp4') + json_ld_data = self._search_json_ld(webpage, display_id, default={}) + extra_info_dict = {} - return merge_dicts(json_ld_data, { - 'id': self._html_search_meta('video_id', webpage), + if not player_type: + return + + elif player_type == 'flowplayer': + video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id) + video_url = video_json_data['videoUrl'] + + extra_info_dict = { + 'id': self._search_regex(r'identifier\s*:\s*\'([^\']+)', webpage, 'identifier'), + 'thumbnail': video_json_data.get('imageUrl'), + } + + elif player_type == 'detikVideo': + video_url = self._search_regex( + r'videoUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl') + extra_info_dict = { + 'id': 
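A quick standalone check of the base64 property the comment above relies on: a JSON object used as a token header begins with the bytes `{"`, and base64-encoding any input with that two-byte prefix always produces output starting with `ey`. A minimal sketch, separate from the patch itself:

```python
import base64
import json

# A JSON token header starts with '{' followed by '"' (bytes 0x7B 0x22);
# the first two base64 characters of any such input are always 'ey'.
token_header = json.dumps({'alg': 'HS256', 'typ': 'JWT'}).encode()
print(base64.urlsafe_b64encode(token_header)[:2])  # b'ey'
```

This is the property the `ey`-anchored regex a few lines below leans on, together with the dot-separated segment structure of a JWT.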
self._html_search_meta(['video_id', 'dtk:video_id'], webpage), + 'thumbnail': self._search_regex(r'imageUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl'), + 'duration': int_or_none(self._html_search_meta('duration', webpage, fatal=False, default=None)), + 'release_timestamp': int_or_none(self._html_search_meta('dtk:publishdateunix', webpage, fatal=False, default=None), 1000), + 'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000), + 'uploader': self._search_regex( + r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader', + default=None) + } + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + self._sort_formats(formats) + + yield merge_dicts(json_ld_data, extra_info_dict, { + 'display_id': display_id, + 'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage), 'formats': formats, 'subtitles': subtitles, - 'tags': str_or_none(self._html_search_meta(['keywords', 'keyword', 'dtk:keywords'], webpage), '').split(','), + 'tags': try_call(lambda: self._html_search_meta( + ['keywords', 'keyword', 'dtk:keywords'], webpage).split(',')), }) From c53e5cf59fb73769faa97516d70cff7fca39185b Mon Sep 17 00:00:00 2001 From: jhwgh1968 <jhwgh1968@protonmail.com> Date: Tue, 4 Oct 2022 03:16:01 +0000 Subject: [PATCH 227/284] [extractor/redgifs] Fix extractor (#4892) Closes #4805 Authored by: jhwgh1968 --- yt_dlp/extractor/redgifs.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index e3712a1d6b..3181cd409c 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -18,6 +18,12 @@ class RedGifsBaseInfoExtractor(InfoExtractor): 'hd': None, } + _API_HEADERS = { + 'referer': 'https://www.redgifs.com/', + 'origin': 'https://www.redgifs.com', + 'content-type': 'application/json', + } + def _parse_gif_data(self, gif_data): video_id = gif_data.get('id') quality = qualities(tuple(self._FORMATS.keys())) @@ -43,7 +49,7 @@ def _parse_gif_data(self, gif_data): return { 'id': video_id, 'webpage_url': f'https://redgifs.com/watch/{video_id}', - 'ie_key': RedGifsIE.ie_key(), + 'extractor_key': RedGifsIE.ie_key(), 'extractor': 'RedGifs', 'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs', 'timestamp': int_or_none(gif_data.get('createDate')), @@ -57,9 +63,29 @@ def _parse_gif_data(self, gif_data): 'formats': formats, } + def _fetch_oauth_token(self, video_id): + # These pages contain the OAuth token that is necessary to make API calls. + index_page = self._download_webpage(f'https://www.redgifs.com/watch/{video_id}', video_id) + index_js_uri = self._html_search_regex( + r'href="?(/assets/js/index[.a-z0-9]*.js)"?\W', index_page, 'index_js_uri') + index_js = self._download_webpage(f'https://www.redgifs.com/{index_js_uri}', video_id) + # It turns out that a { followed by any valid JSON punctuation will always result in the + # first two characters of the base64 encoding being "ey". 
+ # Use this fact to find any such string constant of a reasonable length with the correct + # punctuation for an oauth token + oauth_token = self._html_search_regex( + r'\w+\s*[=:]\s*"(ey[^"]+\.[^"]*\.[^"]{43,45})"', index_js, 'oauth token') + self._API_HEADERS['authorization'] = f'Bearer {oauth_token}' + def _call_api(self, ep, video_id, *args, **kwargs): + if 'authorization' not in self._API_HEADERS: + self._fetch_oauth_token(video_id) + assert 'authorization' in self._API_HEADERS + + headers = dict(self._API_HEADERS) + headers['x-customheader'] = f'https://www.redgifs.com/watch/{video_id}' data = self._download_json( - f'https://api.redgifs.com/v2/{ep}', video_id, *args, **kwargs) + f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs) if 'error' in data: raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id) return data @@ -102,6 +128,7 @@ class RedGifsIE(RedGifsBaseInfoExtractor): 'like_count': int, 'categories': list, 'age_limit': 18, + 'tags': list, } }, { 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0', @@ -117,13 +144,14 @@ class RedGifsIE(RedGifsBaseInfoExtractor): 'like_count': int, 'categories': list, 'age_limit': 18, + 'tags': list, } }] def _real_extract(self, url): video_id = self._match_id(url).lower() video_info = self._call_api( - f'gifs/{video_id}', video_id, note='Downloading video info') + f'gifs/{video_id}?views=yes', video_id, note='Downloading video info') return self._parse_gif_data(video_info['gif']) From 7f5b3cb8b39c8e73f6c45d521059622b1e140b33 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Tue, 4 Oct 2022 12:18:26 +0900 Subject: [PATCH 228/284] [extractor/booyah] Add extractor (#4834) Closes #4583 Authored by: HobbyistDev, elyse0 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/booyah.py | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 yt_dlp/extractor/booyah.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8e9cfd8fb2..b14047b110 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -220,6 +220,7 @@ from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE +from .booyah import BooyahClipsIE from .bpb import BpbIE from .br import ( BRIE, diff --git a/yt_dlp/extractor/booyah.py b/yt_dlp/extractor/booyah.py new file mode 100644 index 0000000000..8c94714be0 --- /dev/null +++ b/yt_dlp/extractor/booyah.py @@ -0,0 +1,87 @@ +from .common import InfoExtractor +from ..utils import int_or_none, str_or_none, traverse_obj + + +class BooyahBaseIE(InfoExtractor): + _BOOYAH_SESSION_KEY = None + + def _real_initialize(self): + BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( + 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') + + def _get_comments(self, video_id): + comment_json = self._download_json( + f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, + headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} + + return [{ + 'id': comment.get('comment_id'), + 'author': comment.get('from_nickname'), + 'author_id': comment.get('from_uid'), + 'author_thumbnail': comment.get('from_thumbnail'), + 'text': comment.get('content'), + 'timestamp': comment.get('create_time'), + 'like_count': comment.get('like_cnt'), + } for comment in comment_json.get('comment_list') or ()] + + +class 
BooyahClipsIE(BooyahBaseIE): + _VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://booyah.live/clips/13887261322952306617', + 'info_dict': { + 'id': '13887261322952306617', + 'ext': 'mp4', + 'view_count': int, + 'duration': 30, + 'channel_id': 90565760, + 'like_count': int, + 'title': 'Cayendo con estilo 😎', + 'uploader': '♡LɪꜱGΛ​MER​', + 'comment_count': int, + 'uploader_id': '90565760', + 'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', + 'upload_date': '20220617', + 'timestamp': 1655490556, + 'modified_timestamp': 1655490556, + 'modified_date': '20220617', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, + headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) + + formats = [] + for video_data in json_data['playback']['endpoint_list']: + formats.extend(({ + 'url': video_data.get('stream_url'), + 'ext': 'mp4', + 'height': video_data.get('resolution'), + }, { + 'url': video_data.get('download_url'), + 'ext': 'mp4', + 'format_note': 'Watermarked', + 'height': video_data.get('resolution'), + 'preference': -10, + })) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('playback', 'name')), + 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), + 'formats': formats, + 'view_count': traverse_obj(json_data, ('playback', 'views')), + 'like_count': traverse_obj(json_data, ('playback', 'likes')), + 'duration': traverse_obj(json_data, ('playback', 'duration')), + 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), + 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), + 'uploader': traverse_obj(json_data, ('user', 'nickname')), + 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), + 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), + 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), + '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), + } From 1e0daeb314f0644eed5cdd638b6cc5452a6bbab5 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 4 Oct 2022 16:29:29 +1300 Subject: [PATCH 229/284] [extractor/24tv.ua] Add extractors (#5121) Closes #4287 Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/tv24ua.py | 146 ++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 yt_dlp/extractor/tv24ua.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b14047b110..2804886cda 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1875,6 +1875,10 @@ KatsomoIE, MTVUutisetArticleIE, ) +from .tv24ua import ( + TV24UAVideoIE, + TV24UAGenericPassthroughIE +) from .tv2dk import ( TV2DKIE, TV2DKBornholmPlayIE, diff --git a/yt_dlp/extractor/tv24ua.py b/yt_dlp/extractor/tv24ua.py new file mode 100644 index 0000000000..723049e781 --- /dev/null +++ b/yt_dlp/extractor/tv24ua.py @@ -0,0 +1,146 @@ +import base64 +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + get_elements_html_by_class, + js_to_json, + mimetype2ext, + smuggle_url, + traverse_obj, +) + + +class TV24UAVideoIE(InfoExtractor): + _VALID_URL = 
r'https?://24tv\.ua/news/showPlayer\.do.*?(?:\?|&)objectId=(?P<id>\d+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?'] + IE_NAME = '24tv.ua' + _TESTS = [{ + 'url': 'https://24tv.ua/news/showPlayer.do?objectId=2074790&videoUrl=2022/07/2074790&w=640&h=360', + 'info_dict': { + 'id': '2074790', + 'ext': 'mp4', + 'title': 'У Харкові ворожа ракета прилетіла в будинок, де слухали пісні про "офіцерів-росіян"', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + 'url': 'https://24tv.ua/news/showPlayer.do?videoUrl=2022/07/2074790&objectId=2074790&w=640&h=360', + 'only_matching': True, + }] + + _WEBPAGE_TESTS = [ + { + # iframe embed created from share menu. + 'url': 'data:text/html,%3Ciframe%20src=%22https://24tv.ua/news/showPlayer.do?objectId=1886193&videoUrl' + '=2022/03/1886193&w=640&h=360%22%20width=%22640%22%20height=%22360%22%20frameborder=%220%22' + '%20scrolling=%22no%22%3E%3C/iframe%3E', + 'info_dict': { + 'id': '1886193', + 'ext': 'mp4', + 'title': 'Росіяни руйнують Бородянку на Київщині та стріляють з літаків по мешканцях: шокуючі фото', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, + { + 'url': 'https://24tv.ua/vipalyuyut-nashi-mista-sela-dsns-pokazali-motoroshni-naslidki_n1883966', + 'info_dict': { + 'id': '1883966', + 'ext': 'mp4', + 'title': 'Випалюють наші міста та села, – моторошні наслідки обстрілів на Чернігівщині', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'allowed_extractors': ['Generic', '24tv.ua']}, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = [] + subtitles = {} + for j in re.findall(r'vPlayConfig\.sources\s*=\s*(?P<json>\[{\s*(?s:.+?)\s*}])', webpage): + sources = self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or [] + for source in sources: + if mimetype2ext(traverse_obj(source, 'type')) == 'm3u8': + f, s = self._extract_m3u8_formats_and_subtitles(source['src'], video_id) + formats.extend(f) + self._merge_subtitles(subtitles, s) + else: + formats.append({ + 'url': source['src'], + 'ext': determine_ext(source['src']), + }) + thumbnail = traverse_obj( + self._search_json( + r'var\s*vPlayConfig\s*=\s*', webpage, 'thumbnail', + video_id, default=None, transform_source=js_to_json), 'poster') + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': thumbnail or self._og_search_thumbnail(webpage), + 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + } + + +class TV24UAGenericPassthroughIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z0-9]+?\.)?24tv\.ua/(?P<id>[^/]+?_n\d+)' + + _TESTS = [{ + # Generic iframe, not within media_embed + 'url': 'https://24tv.ua/vipalyuyut-nashi-mista-sela-dsns-pokazali-motoroshni-naslidki_n1883966', + 'info_dict': { + 'id': '1883966', + 'ext': 'mp4', + 'title': 'Випалюють наші міста та села, – моторошні наслідки обстрілів на Чернігівщині', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + # Generic iframe embed of TV24UAPlayerIE, within media_embed + 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'info_dict': { + 'id': 'harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'title': 'Харків\'яни згадують місто до війни: щемливе відео' + }, + 'playlist': [{ + 'info_dict': { + 'id': '1887584', + 'ext': 'mp4', + 'title': 'Харків\'яни 
згадують місто до війни: щемливе відео', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + }] + }, { + # 2 media_embeds with YouTube iframes + 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'info_dict': { + 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', + }, + 'playlist_count': 2 + }, { + 'url': 'https://men.24tv.ua/fitnes-bloger-sprobuvav-vikonati-trenuvannya-naysilnishoyi-lyudini_n2164538', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data_urls = [] + # The site contains escaped iframe embeds within an attribute. + # Once escaped, generic can handle them, so we use a data url to pass the escaped html back. + for html in get_elements_html_by_class('media_embed', webpage): + data = urllib.parse.unquote(extract_attributes(html).get('data-html')) + data_urls.append(f'data:text/html;base64,{base64.b64encode(data.encode("utf-8")).decode("utf-8")}') + + if not data_urls: + return self.url_result(url, 'Generic') + return self.playlist_from_matches( + [smuggle_url(url, {'to_generic': True}) for url in data_urls], display_id, ie='Generic', + playlist_title=self._og_search_title(webpage) or self._html_extract_title(webpage)) From 143a2ccab39a4e6477521f0d563f940a97fa9dc6 Mon Sep 17 00:00:00 2001 From: columndeeply <106948293+columndeeply@users.noreply.github.com> Date: Tue, 4 Oct 2022 05:33:46 +0200 Subject: [PATCH 230/284] [extractor/prankcast] Add extractor (#4774) Authored by: columndeeply, HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/prankcast.py | 49 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 yt_dlp/extractor/prankcast.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2804886cda..3ecd7748bf 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1382,6 +1382,7 @@ PuhuTVIE, PuhuTVSerieIE, ) +from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE from .projectveritas import ProjectVeritasIE diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py new file mode 100644 index 0000000000..7446caf3c0 --- /dev/null +++ b/yt_dlp/extractor/prankcast.py @@ -0,0 +1,49 @@ +from .common import InfoExtractor +from ..utils import parse_iso8601, traverse_obj, try_call + + +class PrankCastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/showreel/(?P<id>\d+)-(?P<display_id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://prankcast.com/Devonanustart/showreel/1561-Beverly-is-back-like-a-heart-attack-', + 'info_dict': { + 'id': '1561', + 'ext': 'mp3', + 'title': 'Beverly is back like a heart attack!', + 'display_id': 'Beverly-is-back-like-a-heart-attack-', + 'timestamp': 1661391575, + 'uploader': 'Devonanustart', + 'channel_id': 4, + 'duration': 7918, + 'cast': ['Devonanustart', 'Phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank'], + 'upload_date': '20220825' + } + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + + webpage = self._download_webpage(url, video_id) + json_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_showreel'] + + uploader = 
json_info.get('user_name') + guests_json = self._parse_json(json_info.get('guests_json') or '{}', video_id) + start_date = parse_iso8601(json_info.get('start_date')) + + return { + 'id': video_id, + 'title': json_info.get('broadcast_title') or self._og_search_title(webpage), + 'display_id': display_id, + 'url': f'{json_info["broadcast_url"]}{json_info["recording_hash"]}.mp3', + 'timestamp': start_date, + 'uploader': uploader, + 'channel_id': json_info.get('user_id'), + 'duration': try_call(lambda: parse_iso8601(json_info['end_date']) - start_date), + 'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))), + 'description': json_info.get('broadcast_description'), + 'categories': [json_info.get('broadcast_category')], + 'tags': self._parse_json(json_info.get('broadcast_tags') or '{}', video_id) + } From 34859e4b32a7c2c74a54c6734678e8513885da43 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Tue, 4 Oct 2022 17:14:57 +1300 Subject: [PATCH 231/284] [extractor/onenewsnz] Add extractor (#5088) Authored by: coletdjnz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/onenewsnz.py | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 yt_dlp/extractor/onenewsnz.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3ecd7748bf..44c189f797 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1234,6 +1234,7 @@ from .on24 import On24IE from .ondemandkorea import OnDemandKoreaIE from .onefootball import OneFootballIE +from .onenewsnz import OneNewsNZIE from .onet import ( OnetIE, OnetChannelIE, diff --git a/yt_dlp/extractor/onenewsnz.py b/yt_dlp/extractor/onenewsnz.py new file mode 100644 index 0000000000..59d4490d0f --- /dev/null +++ b/yt_dlp/extractor/onenewsnz.py @@ -0,0 +1,112 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + traverse_obj +) + + +class OneNewsNZIE(InfoExtractor): + IE_NAME = '1News' + IE_DESC = '1news.co.nz article videos' + _VALID_URL = r'https?://(?:www\.)?(?:1|one)news\.co\.nz/\d+/\d+/\d+/(?P<id>[^/?#&]+)' + _TESTS = [ + { # Brightcove video + 'url': 'https://www.1news.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/', + 'info_dict': { + 'id': 'cows-painted-green-on-parliament-lawn-in-climate-protest', + 'title': '\'Cows\' painted green on Parliament lawn in climate protest', + }, + 'playlist': [{ + 'info_dict': { + 'id': '6312993358112', + 'title': 'Activists dressed as cows painted green outside Parliament in climate protest', + 'ext': 'mp4', + 'tags': 'count:6', + 'uploader_id': '963482464001', + 'timestamp': 1664416255, + 'upload_date': '20220929', + 'duration': 38.272, + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Greenpeace accused the Government of "greenwashing" instead of taking climate action.', + } + }] + }, { + # YouTube video + 'url': 'https://www.1news.co.nz/2022/09/30/now-is-the-time-to-care-about-womens-rugby/', + 'info_dict': { + 'id': 'now-is-the-time-to-care-about-womens-rugby', + 'title': 'Now is the time to care about women\'s rugby', + }, + 'playlist': [{ + 'info_dict': { + 'id': 's4wEB9neTfU', + 'title': 'Why I love women’s rugby: Black Fern Ruahei Demant', + 'ext': 'mp4', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ', + 'tags': 'count:12', + 'uploader': 'Re: News', + 'upload_date': '20211215', + 'uploader_id': 
'UC2BQ3U9IxoYIJyulv0bN5PQ', + 'uploader_url': 'http://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ', + 'channel_id': 'UC2BQ3U9IxoYIJyulv0bN5PQ', + 'channel': 'Re: News', + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/s4wEB9neTfU/maxresdefault.jpg', + 'age_limit': 0, + 'view_count': int, + 'categories': ['Sports'], + 'duration': 222, + 'description': 'md5:8874410e5740ed1d8fd0df839f849813', + 'availability': 'public', + 'playable_in_embed': True, + 'live_status': 'not_live', + } + }] + }, { + # 2 Brightcove videos + 'url': 'https://www.1news.co.nz/2022/09/29/raw-videos-capture-hurricane-ians-fury-as-it-slams-florida/', + 'info_dict': { + 'id': 'raw-videos-capture-hurricane-ians-fury-as-it-slams-florida', + 'title': 'Raw videos capture Hurricane Ian\'s fury as it slams Florida', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://www.onenews.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/', + 'only_matching': True, + }] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/0xpHIR6IB_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + fusion_metadata = self._search_json(r'Fusion\.globalContent\s*=', webpage, 'fusion metadata', display_id) + + entries = [] + for item in traverse_obj(fusion_metadata, 'content_elements') or []: + item_type = traverse_obj(item, 'subtype') + if item_type == 'video': + brightcove_config = traverse_obj(item, ('embed', 'config')) + brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + traverse_obj(brightcove_config, 'brightcoveAccount') or '963482464001', + traverse_obj(brightcove_config, 'brightcoveVideoId') + ) + entries.append(self.url_result(brightcove_url, BrightcoveNewIE)) + elif item_type == 'youtube': + video_id_or_url = traverse_obj(item, ('referent', 'id'), ('raw_oembed', '_id')) + if video_id_or_url: + entries.append(self.url_result(video_id_or_url, ie='Youtube')) + + if not entries: + raise ExtractorError('This article does not have a video.', expected=True) + + playlist_title = ( + traverse_obj(fusion_metadata, ('headlines', 'basic')) + or self._og_search_title(webpage) + or self._html_extract_title(webpage) + ) + return self.playlist_result(entries, display_id, playlist_title) From 878eac3e2e3dfc0b811e9575056d89e19e060e79 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 4 Oct 2022 09:49:18 +0530 Subject: [PATCH 232/284] [docs] Separate notes about environment variables --- README.md | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 8f93ba415a..f0d2686df8 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ * [Extractor Options](#extractor-options) * [CONFIGURATION](#configuration) * [Authentication with .netrc file](#authentication-with-netrc-file) + * [Notes about environment variables](#notes-about-environment-variables) * [OUTPUT TEMPLATE](#output-template) * [Output template examples](#output-template-examples) * [FORMAT SELECTION](#format-selection) @@ -679,8 +680,7 @@ ## Filesystem Options: --cache-dir DIR Location in the filesystem where yt-dlp can store some downloaded information (such as client ids and signatures) permanently. 
By - default $XDG_CACHE_HOME/yt-dlp or - ~/.cache/yt-dlp + default ${XDG_CACHE_HOME}/yt-dlp --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files @@ -1088,20 +1088,25 @@ # CONFIGURATION You can configure yt-dlp by placing any supported command line option to a configuration file. The configuration is loaded from the following locations: -1. **Main Configuration**: The file given by `--config-location` -1. **Portable Configuration**: `yt-dlp.conf` in the same directory as the bundled binary. If you are running from source-code (`<root dir>/yt_dlp/__main__.py`), the root directory is used instead. -1. **Home Configuration**: `yt-dlp.conf` in the home path given by `-P`, or in the current directory if no such path is given +1. **Main Configuration**: + * The file given by `--config-location` +1. **Portable Configuration**: (Recommended for portable installations) + * If using a binary, `yt-dlp.conf` in the same directory as the binary + * If running from source-code, `yt-dlp.conf` in the parent directory of `yt_dlp` +1. **Home Configuration**: + * `yt-dlp.conf` in the home path given by `-P` + * If `-P` is not given, the current directory is searched 1. **User Configuration**: - * `$XDG_CONFIG_HOME/yt-dlp/config` (recommended on Linux/macOS) - * `$XDG_CONFIG_HOME/yt-dlp.conf` - * `$APPDATA/yt-dlp/config` (recommended on Windows) - * `$APPDATA/yt-dlp/config.txt` + * `${XDG_CONFIG_HOME}/yt-dlp/config` (recommended on Linux/macOS) + * `${XDG_CONFIG_HOME}/yt-dlp.conf` + * `${APPDATA}/yt-dlp/config` (recommended on Windows) + * `${APPDATA}/yt-dlp/config.txt` * `~/yt-dlp.conf` * `~/yt-dlp.conf.txt` - - `$XDG_CONFIG_HOME` defaults to `~/.config` if undefined. On windows, `$APPDATA` generally points to `C:\Users\<user name>\AppData\Roaming` and `~` points to `$HOME` if present, `$USERPROFILE` (generally `C:\Users\<user name>`), or `${HOMEDRIVE}${HOMEPATH}` -1. **System Configuration**: `/etc/yt-dlp.conf` + See also: [Notes about environment variables](#notes-about-environment-variables) +1. **System Configuration**: + * `/etc/yt-dlp.conf` E.g. with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: ``` @@ -1134,8 +1139,8 @@ ### Authentication with `.netrc` file You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: ``` -touch $HOME/.netrc -chmod a-rwx,u+rw $HOME/.netrc +touch ${HOME}/.netrc +chmod a-rwx,u+rw ${HOME}/.netrc ``` After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: ``` @@ -1148,7 +1153,14 @@ ### Authentication with `.netrc` file ``` To activate authentication with the `.netrc` file you should pass `--netrc` to yt-dlp or place it in the [configuration file](#configuration). -The default location of the .netrc file is `$HOME` (`~`). 
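Since the entry format is easy to get wrong, one way to sanity-check a `.netrc` file is Python's standard-library parser, which reads the same format yt-dlp consumes. A minimal sketch (assumes the file already exists at the default location and contains an entry for the `youtube` extractor; the credentials themselves are placeholders):

```python
import netrc

# netrc.netrc() parses ~/.netrc by default and raises
# NetrcParseError on a malformed entry
auth = netrc.netrc().authenticators('youtube')
if auth:
    login, _account, password = auth
    print(f'parsed credentials for login {login!r}')
```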
On Windows, if `$HOME` is not present, `$USERPROFILE` (generally `C:\Users\<user name>`) or `${HOMEDRIVE}${HOMEPATH}` is used +The default location of the .netrc file is `~` (see below). + +### Notes about environment variables +* Environment variables are normally specified as `${VARIABLE}`/`$VARIABLE` on UNIX and `%VARIABLE%` on Windows; but is always shown as `${VARIABLE}` in this documentation +* yt-dlp also allow using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location` +* If unset, `${XDG_CONFIG_HOME}` defaults to `~/.config` and `${XDG_CACHE_HOME}` to `~/.cache` +* On Windows, `~` points to `${HOME}` if present; or, `${USERPROFILE}` or `${HOMEDRIVE}${HOMEPATH}` otherwise +* On Windows, `${USERPROFILE}` generally points to `C:\Users\<user name>` and `${APPDATA}` to `${USERPROFILE}\AppData\Roaming` # OUTPUT TEMPLATE From 304ad45a9b18cba7b62e7cb435fb0ddc49003ed7 Mon Sep 17 00:00:00 2001 From: gamer191 <83270075+gamer191@users.noreply.github.com> Date: Tue, 4 Oct 2022 15:23:11 +1100 Subject: [PATCH 233/284] [cleanup] Misc (#5044) Authored by: gamer191, pukkandan --- .gitignore | 5 ++++- Makefile | 4 ++-- README.md | 33 +++++++++++++++--------------- yt_dlp/extractor/acfun.py | 6 +++--- yt_dlp/extractor/anvato.py | 4 ++-- yt_dlp/extractor/audioboom.py | 7 +------ yt_dlp/extractor/bandcamp.py | 4 ++-- yt_dlp/extractor/hrfensehen.py | 2 +- yt_dlp/extractor/huya.py | 2 +- yt_dlp/extractor/iltalehti.py | 2 +- yt_dlp/extractor/instagram.py | 2 +- yt_dlp/extractor/liputan6.py | 2 +- yt_dlp/extractor/microsoftembed.py | 6 +----- yt_dlp/extractor/nbc.py | 2 +- yt_dlp/extractor/rcs.py | 4 ++-- yt_dlp/extractor/trovo.py | 2 +- yt_dlp/extractor/tviplayer.py | 2 +- yt_dlp/extractor/yandexvideo.py | 2 +- yt_dlp/options.py | 4 +++- yt_dlp/utils.py | 8 ++++---- 20 files changed, 50 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 2e84762bcd..0ce059b34d 100644 --- a/.gitignore +++ b/.gitignore @@ -33,13 +33,14 @@ cookies *.jpeg *.jpg *.m4a -*.mpga *.m4v *.mhtml *.mkv *.mov *.mp3 *.mp4 +*.mpga +*.oga *.ogg *.opus *.png @@ -47,6 +48,7 @@ cookies *.srt *.swf *.swp +*.tt *.ttml *.url *.vtt @@ -85,6 +87,7 @@ updates_key.pem .tox *.class *.isorted +*.stackdump # Generated AUTHORS diff --git a/Makefile b/Makefile index 3b97c74079..8f335927d0 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \ clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.meta *.part* *.tmp *.temp *.unknown_video *.ytdl \ - *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.jpeg *.jpg *.m4a *.mpga *.m4v *.mhtml *.mkv *.mov \ - *.mp3 *.mp4 *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp + *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.jpeg *.jpg *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 *.mp4 \ + *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp clean-dist: rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap diff --git a/README.md b/README.md index f0d2686df8..e0a1ea059b 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ * [SponsorBlock Options](#sponsorblock-options) * [Extractor 
Options](#extractor-options) * [CONFIGURATION](#configuration) + * [Configuration file encoding](#configuration-file-encoding) * [Authentication with .netrc file](#authentication-with-netrc-file) * [Notes about environment variables](#notes-about-environment-variables) * [OUTPUT TEMPLATE](#output-template) @@ -75,7 +76,7 @@ # NEW FEATURES * Merged with **youtube-dl v2021.12.17+ [commit/ed5c44e](https://github.com/ytdl-org/youtube-dl/commit/ed5c44e7b74ac77f87ca5ed6cb5e964a0c6a0678)**<!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) -* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API +* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API * **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) @@ -89,7 +90,7 @@ # NEW FEATURES * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given * Redirect channel's home URL automatically to `/video` to preserve the old behaviour -* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE]` +* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` * **Download time range**: Videos can be downloaded partially based on either timestamps or chapters using `--download-sections` @@ -141,8 +142,8 @@ ### Differences in default behavior * `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior * The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading -* Youtube channel URLs are automatically redirected to `/video`. Append a `/featured` to the URL to download only the videos in the home page. If the channel does not have a videos tab, we try to download the equivalent `UU` playlist instead. For all other tabs, if the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections -* Unavailable videos are also listed for youtube playlists. 
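For scripts that embed yt-dlp instead of invoking the CLI, the same switches are exposed through the `compat_opts` parameter; the values are the option names listed in this section. A minimal sketch (the URL is a placeholder):

```python
from yt_dlp import YoutubeDL

# Rough equivalent of: yt-dlp --compat-options playlist-index,no-live-chat URL
ydl_opts = {'compat_opts': ['playlist-index', 'no-live-chat']}
with YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=example', download=False)
```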
Use `--compat-options no-youtube-unavailable-videos` to remove this +* YouTube channel URLs are automatically redirected to `/video`. Append a `/featured` to the URL to download only the videos in the home page. If the channel does not have a videos tab, we try to download the equivalent `UU` playlist instead. For all other tabs, if the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections +* Unavailable videos are also listed for YouTube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this * The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead @@ -303,7 +304,7 @@ ### Related scripts * **`devscripts/set-variant.py variant [-M update_message]`** - Set the build variant of the executable * **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading. -You can also fork the project on github and run your fork's [build workflow](.github/workflows/build.yml) to automatically build a full release +You can also fork the project on GitHub and run your fork's [build workflow](.github/workflows/build.yml) to automatically build a full release # USAGE AND OPTIONS @@ -1129,15 +1130,15 @@ # Save all videos under YouTube directory in your home directory You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. -### Config file encoding +### Configuration file encoding -The config files are decoded according to the UTF BOM if present, and in the encoding from system locale otherwise. +The configuration files are decoded according to the UTF BOM if present, and in the encoding from system locale otherwise. If you want your file to be decoded differently, add `# coding: ENCODING` to the beginning of the file (e.g. `# coding: shift-jis`). There must be no characters before that, even spaces or BOM. ### Authentication with `.netrc` file -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. 
You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: ``` touch ${HOME}/.netrc chmod a-rwx,u+rw ${HOME}/.netrc @@ -1184,7 +1185,7 @@ # OUTPUT TEMPLATE 1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s` -1. **Replacement**: A replacement value can specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. +1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. 1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s` @@ -1411,7 +1412,7 @@ # FORMAT SELECTION You can select the n'th best format of a type by using `best<type>.<n>`. For example, `best.2` will select the 2nd best combined format. Similarly, `bv*.3` will select the 3rd best format that contains a video stream. -If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes. Note that formats on the left hand side are preferred; e.g. `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download. +If you want to download multiple videos, and they don't have the same formats available, you can specify the order of preference using slashes. Note that formats on the left hand side are preferred; e.g. `-f 22/17/18` will download format 22 if it's available, otherwise it will download format 17 if it's available, otherwise it will download format 18 if it's available, otherwise it will complain that no suitable formats are available for download. If you want to download several formats of the same video use a comma as a separator, e.g. `-f 22,17,18` will download all these three formats, of course if they are available. Or a more sophisticated example combined with the precedence feature: `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. @@ -1419,7 +1420,7 @@ # FORMAT SELECTION **Deprecation warning**: Since the *below* described behavior is complex and counter-intuitive, this will be removed and multistreams will be enabled by default in the future. 
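The selector syntax is identical when yt-dlp is embedded, so the precedence rules above can be exercised from Python as well. A minimal sketch (the URL is a placeholder):

```python
from yt_dlp import YoutubeDL

# Rough equivalent of: yt-dlp -f 'bv*.3/b' URL
# Selects the 3rd best video-containing format, falling back to the
# best combined format if that selector yields nothing.
with YoutubeDL({'format': 'bv*.3/b'}) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=example'])
```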
A new operator will be instead added to limit formats to single audio/video -Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. E.g. `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. `best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. `-f best+bestaudio --no-audio-multistreams` will download and merge both formats while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. +Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. E.g. `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. `best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. `-f best+bestaudio --no-audio-multistreams` will download only `best` while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. ## Filtering Formats @@ -1468,8 +1469,8 @@ ## Sorting Formats The available fields are: - - `hasvid`: Gives priority to formats that has a video stream - - `hasaud`: Gives priority to formats that has a audio stream + - `hasvid`: Gives priority to formats that have a video stream + - `hasaud`: Gives priority to formats that have an audio stream - `ie_pref`: The format preference - `lang`: The language preference - `quality`: The quality of the format @@ -1711,7 +1712,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Language code to prefer translated metadata of this language (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. 
The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` @@ -1725,11 +1726,11 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `approximate_date`: Extract approximate `upload_date` in flat-playlist. This may cause date-based filters to be slightly off #### funimation -* `language`: Languages to extract, e.g. `funimation:language=english,japanese` +* `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` * `version`: The video version to extract - `uncut` or `simulcast` #### crunchyroll -* `language`: Languages to extract, e.g. `crunchyroll:language=jaJp` +* `language`: Audio languages to extract, e.g. `crunchyroll:language=jaJp` * `hardsub`: Which hard-sub versions to extract, e.g. 
`crunchyroll:hardsub=None,enUS` #### crunchyrollbeta diff --git a/yt_dlp/extractor/acfun.py b/yt_dlp/extractor/acfun.py index 615efd9bb0..92b905fa7c 100644 --- a/yt_dlp/extractor/acfun.py +++ b/yt_dlp/extractor/acfun.py @@ -84,7 +84,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - json_all = self._search_json(r'window.videoInfo\s*=\s*', webpage, 'videoInfo', video_id) + json_all = self._search_json(r'window.videoInfo\s*=', webpage, 'videoInfo', video_id) title = json_all.get('title') video_list = json_all.get('videoList') or [] @@ -164,7 +164,7 @@ def _real_extract(self, url): video_id = f'{video_id}{format_field(ac_idx, template="__%s")}' webpage = self._download_webpage(url, video_id) - json_bangumi_data = self._search_json(r'window.bangumiData\s*=\s*', webpage, 'bangumiData', video_id) + json_bangumi_data = self._search_json(r'window.bangumiData\s*=', webpage, 'bangumiData', video_id) if ac_idx: video_info = json_bangumi_data['hlVideoInfo'] @@ -181,7 +181,7 @@ def _real_extract(self, url): if v.get('id') == season_id), 1) json_bangumi_list = self._search_json( - r'window\.bangumiList\s*=\s*', webpage, 'bangumiList', video_id, fatal=False) + r'window\.bangumiList\s*=', webpage, 'bangumiList', video_id, fatal=False) video_internal_id = int_or_none(traverse_obj(json_bangumi_data, ('currentVideoInfo', 'id'))) episode_number = video_internal_id and next(( idx for idx, v in enumerate(json_bangumi_list.get('items') or [], 1) diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index 5d03070852..0d7575a1f5 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -10,11 +10,11 @@ from ..utils import ( bytes_to_intlist, determine_ext, - intlist_to_bytes, int_or_none, + intlist_to_bytes, join_nonempty, - strip_jsonp, smuggle_url, + strip_jsonp, traverse_obj, unescapeHTML, unsmuggle_url, diff --git a/yt_dlp/extractor/audioboom.py b/yt_dlp/extractor/audioboom.py index f1aa0201b6..a23fcd2999 100644 --- a/yt_dlp/extractor/audioboom.py +++ b/yt_dlp/extractor/audioboom.py @@ -1,10 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, - unescapeHTML, - traverse_obj, -) +from ..utils import clean_html, float_or_none, traverse_obj, unescapeHTML class AudioBoomIE(InfoExtractor): diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 2dae49e770..a864ff9ac7 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -5,16 +5,16 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + KNOWN_EXTENSIONS, ExtractorError, float_or_none, int_or_none, - KNOWN_EXTENSIONS, parse_filesize, str_or_none, try_get, - update_url_query, unified_strdate, unified_timestamp, + update_url_query, url_or_none, urljoin, ) diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py index dd72d86d77..447782019d 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -1,6 +1,7 @@ import json import re +from .common import InfoExtractor from ..utils import ( int_or_none, traverse_obj, @@ -8,7 +9,6 @@ unescapeHTML, unified_timestamp, ) -from .common import InfoExtractor class HRFernsehenIE(InfoExtractor): diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 6d6f099561..c05e77c321 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -54,7 +54,7 @@ class HuyaLiveIE(InfoExtractor): def _real_extract(self, url): video_id = 
self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - stream_data = self._search_json(r'stream:\s+', webpage, 'stream', video_id=video_id, default=None) + stream_data = self._search_json(r'stream:\s', webpage, 'stream', video_id=video_id, default=None) room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) diff --git a/yt_dlp/extractor/iltalehti.py b/yt_dlp/extractor/iltalehti.py index a40307aed4..0e7e82c9c0 100644 --- a/yt_dlp/extractor/iltalehti.py +++ b/yt_dlp/extractor/iltalehti.py @@ -41,7 +41,7 @@ def _real_extract(self, url): article_id = self._match_id(url) webpage = self._download_webpage(url, article_id) info = self._search_json( - r'<script>\s*window.App\s*=\s*', webpage, 'json', article_id, + r'<script>\s*window.App\s*=', webpage, 'json', article_id, transform_source=js_to_json) props = traverse_obj(info, ( 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index c9da7e36f6..fc08f377cf 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -452,7 +452,7 @@ def _real_extract(self, url): webpage = self._download_webpage( f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) additional_data = self._search_json( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) if not additional_data and not media: self.raise_login_required('Requested content is not available, rate-limit reached or login required') diff --git a/yt_dlp/extractor/liputan6.py b/yt_dlp/extractor/liputan6.py index b5dbffe24a..c4477b93e0 100644 --- a/yt_dlp/extractor/liputan6.py +++ b/yt_dlp/extractor/liputan6.py @@ -57,7 +57,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) json_data = self._search_json( - r'window.kmklabs.gtm\s*=\s*', webpage, 'json_data', display_id) + r'window.kmklabs.gtm\s*=', webpage, 'json_data', display_id) video_id = json_data['videos']['video_1']['video_id'] return self.url_result( diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 8cdf66778b..1425a0159e 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - int_or_none, - traverse_obj, - unified_timestamp, -) +from ..utils import int_or_none, traverse_obj, unified_timestamp class MicrosoftEmbedIE(InfoExtractor): diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 6b482620a7..3de8c15088 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -643,7 +643,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) nbc_data = self._search_json( - r'<script>var\s*nbc\s*=\s*', webpage, 'NBC JSON data', video_id) + r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id) pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py index e6185fec75..d69a1a216c 100644 --- a/yt_dlp/extractor/rcs.py +++ b/yt_dlp/extractor/rcs.py @@ -2,10 +2,10 @@ from .common import InfoExtractor from 
..utils import ( - clean_html, ExtractorError, - js_to_json, base_url, + clean_html, + js_to_json, url_basename, urljoin, ) diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py index f4d4bcd174..b7aa74060a 100644 --- a/yt_dlp/extractor/trovo.py +++ b/yt_dlp/extractor/trovo.py @@ -7,9 +7,9 @@ from ..utils import ( ExtractorError, format_field, - traverse_obj, int_or_none, str_or_none, + traverse_obj, try_get, ) diff --git a/yt_dlp/extractor/tviplayer.py b/yt_dlp/extractor/tviplayer.py index f60cfb050e..7e9b04d55b 100644 --- a/yt_dlp/extractor/tviplayer.py +++ b/yt_dlp/extractor/tviplayer.py @@ -62,7 +62,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) json_data = self._search_json( - r'<script>\s*jsonData\s*=\s*', webpage, 'json_data', video_id) + r'<script>\s*jsonData\s*=', webpage, 'json_data', video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles( f'{json_data["videoUrl"]}?wmsAuthSign={self.wms_auth_sign_token}', diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index eadb1aaeeb..0b621dbd25 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -6,9 +6,9 @@ determine_ext, extract_attributes, int_or_none, + lowercase_escape, try_get, url_or_none, - lowercase_escape, ) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 861bbf7864..5ff375fcfa 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1417,7 +1417,9 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not load cookies from browser (default)') filesystem.add_option( '--cache-dir', dest='cachedir', default=None, metavar='DIR', - help='Location in the filesystem where yt-dlp can store some downloaded information (such as client ids and signatures) permanently. By default $XDG_CACHE_HOME/yt-dlp or ~/.cache/yt-dlp') + help=( + 'Location in the filesystem where yt-dlp can store some downloaded information ' + '(such as client ids and signatures) permanently. 
By default ${XDG_CACHE_HOME}/yt-dlp')) filesystem.add_option( '--no-cache-dir', action='store_false', dest='cachedir', help='Disable filesystem caching') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 6cba9299a5..d0be7f19ef 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3180,6 +3180,10 @@ def multipart_encode(data, boundary=None): return out, content_type +def variadic(x, allowed_types=(str, bytes, dict)): + return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): for val in map(d.get, variadic(key_or_keys)): if val is not None and (val or not skip_false_values): @@ -5446,10 +5450,6 @@ def get_first(obj, keys, **kwargs): return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) -def variadic(x, allowed_types=(str, bytes, dict)): - return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) - - def time_seconds(**kwargs): t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs))) return t.timestamp() From 4e0511f27d153ee0dbc4da158b4e35add8f7511a Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 4 Oct 2022 10:17:45 +0530 Subject: [PATCH 234/284] Release 2022.10.04 --- CONTRIBUTORS | 22 ++++++++++ Changelog.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++ supportedsites.md | 45 ++++++++++++++------ 3 files changed, 160 insertions(+), 12 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 7859170568..264c087c2f 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -309,3 +309,25 @@ shreyasminocha tejasa97 xenov satan1st +0xGodspeed +5736d79 +587021c +basrieter +Bobscorn +CNugteren +columndeeply +DoubleCouponDay +Fabi019 +GautamMKGarg +Grub4K +itachi-19 +jeroenj +josanabr +LiviaMedeiros +nikita-moor +snapdgn +SuperSonicHub1 +tannertechnology +Timendum +tobi1805 +TokyoBlackHole diff --git a/Changelog.md b/Changelog.md index 561b88ce63..d7600b0463 100644 --- a/Changelog.md +++ b/Changelog.md @@ -11,6 +11,111 @@ # Instuctions for creating release --> +### 2022.10.04 + +* Allow a `set` to be passed as `download_archive` by [pukkandan](https://github.com/pukkandan), [bashonly](https://github.com/bashonly) +* Allow open ranges for time ranges by [Lesmiscore](https://github.com/Lesmiscore) +* Allow plugin extractors to replace the built-in ones +* Don't download entire video when no matching `--download-sections` +* Fix `--config-location -` +* Improve [5736d79](https://github.com/yt-dlp/yt-dlp/pull/5044/commits/5736d79172c47ff84740d5720467370a560febad) +* Fix for when playlists don't have `webpage_url` +* Support environment variables in `--ffmpeg-location` +* Workaround `libc_ver` not be available on Windows Store version of Python +* [outtmpl] Curly braces to filter keys by [pukkandan](https://github.com/pukkandan) +* [outtmpl] Make `%s` work in strfformat for all systems +* [jsinterp] Workaround operator associativity issue +* [cookies] Let `_get_mac_keyring_password` fail gracefully +* [cookies] Parse cookies leniently by [Grub4K](https://github.com/Grub4K) +* [phantomjs] Fix bug in [587021c](https://github.com/yt-dlp/yt-dlp/commit/587021cd9f717181b44e881941aca3f8d753758b) by [elyse0](https://github.com/elyse0) +* [downloader/aria2c] Fix filename containing leading whitespace by [std-move](https://github.com/std-move) +* [downloader/ism] Support ec-3 codec by [nixxo](https://github.com/nixxo) +* [extractor] Fix `fatal=False` in `RetryManager` +* [extractor] 
Improve json-ld extraction +* [extractor] Make `_search_json` able to parse lists +* [extractor] Escape `%` in `representation_id` of m3u8 +* [extractor/generic] Pass through referer from json-ld +* [utils] `base_url`: URL paths can contain `&` by [elyse0](https://github.com/elyse0) +* [utils] `js_to_json`: Improve +* [utils] `Popen.run`: Fix default return in binary mode +* [utils] `traverse_obj`: Rewrite, document and add tests by [Grub4K](https://github.com/Grub4K) +* [devscripts] `make_lazy_extractors`: Fix for Docker by [josanabr](https://github.com/josanabr) +* [docs] Misc Improvements +* [cleanup] Misc fixes and cleanup by [pukkandan](https://github.com/pukkandan), [gamer191](https://github.com/gamer191) +* [extractor/24tv.ua] Add extractors by [coletdjnz](https://github.com/coletdjnz) +* [extractor/BerufeTV] Add extractor by [Fabi019](https://github.com/Fabi019) +* [extractor/booyah] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [elyse0](https://github.com/elyse0) +* [extractor/bundesliga] Add extractor by [Fabi019](https://github.com/Fabi019) +* [extractor/GoPlay] Add extractor by [CNugteren](https://github.com/CNugteren), [basrieter](https://github.com/basrieter), [jeroenj](https://github.com/jeroenj) +* [extractor/iltalehti] Add extractor by [tpikonen](https://github.com/tpikonen) +* [extractor/IsraelNationalNews] Add extractor by [Bobscorn](https://github.com/Bobscorn) +* [extractor/mediaworksnzvod] Add extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/MicrosoftEmbed] Add extractor by [DoubleCouponDay](https://github.com/DoubleCouponDay) +* [extractor/nbc] Add NBCStations extractor by [bashonly](https://github.com/bashonly) +* [extractor/onenewsnz] Add extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/prankcast] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [columndeeply](https://github.com/columndeeply) +* [extractor/Smotrim] Add extractor by [Lesmiscore](https://github.com/Lesmiscore), [nikita-moor](https://github.com/nikita-moor) +* [extractor/tencent] Add Iflix extractor by [elyse0](https://github.com/elyse0) +* [extractor/unscripted] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/adobepass] Add MSO AlticeOne (Optimum TV) by [CplPwnies](https://github.com/CplPwnies) +* [extractor/youtube] **Download `post_live` videos from start** by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan) +* [extractor/youtube] Add support for Shorts audio pivot feed by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [extractor/youtube] Detect `lazy-load-for-videos` embeds +* [extractor/youtube] Do not warn on duplicate chapters +* [extractor/youtube] Fix video like count extraction by [coletdjnz](https://github.com/coletdjnz) +* [extractor/youtube] Support changing extraction language by [coletdjnz](https://github.com/coletdjnz) +* [extractor/youtube:tab] Improve continuation items extraction +* [extractor/youtube:tab] Support `reporthistory` page +* [extractor/amazonstore] Fix JSON extraction by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [extractor/amazonstore] Retry to avoid captcha page by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/animeondemand] Remove extractor by [TokyoBlackHole](https://github.com/TokyoBlackHole) +* [extractor/anvato] Fix extractor and refactor by [bashonly](https://github.com/bashonly) +* [extractor/artetv] Remove duplicate stream 
urls by [Grub4K](https://github.com/Grub4K) +* [extractor/audioboom] Support direct URLs and refactor by [pukkandan](https://github.com/pukkandan), [tpikonen](https://github.com/tpikonen) +* [extractor/bandcamp] Extract `uploader_url` +* [extractor/bilibili] Add space.bilibili extractors by [lockmatrix](https://github.com/lockmatrix) +* [extractor/BilibiliSpace] Fix extractor and better error message by [lockmatrix](https://github.com/lockmatrix) +* [extractor/BiliIntl] Support uppercase lang in `_VALID_URL` by [coletdjnz](https://github.com/coletdjnz) +* [extractor/BiliIntlSeries] Fix `_VALID_URL` +* [extractor/bongacams] Update `_VALID_URL` by [0xGodspeed](https://github.com/0xGodspeed) +* [extractor/crunchyroll:beta] Improve handling of hardsubs by [Grub4K](https://github.com/Grub4K) +* [extractor/detik] Generalize extractors by [HobbyistDev](https://github.com/HobbyistDev), [coletdjnz](https://github.com/coletdjnz) +* [extractor/dplay:italy] Add default authentication by [Timendum](https://github.com/Timendum) +* [extractor/heise] Fix extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/holodex] Fix `_VALID_URL` by [LiviaMedeiros](https://github.com/LiviaMedeiros) +* [extractor/hrfensehen] Fix extractor by [snapdgn](https://github.com/snapdgn) +* [extractor/hungama] Add subtitle by [GautamMKGarg](https://github.com/GautamMKGarg), [pukkandan](https://github.com/pukkandan) +* [extractor/instagram] Extract more metadata by [pritam20ps05](https://github.com/pritam20ps05) +* [extractor/JWPlatform] Fix extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/malltv] Fix video_id extraction by [HobbyistDev](https://github.com/HobbyistDev) +* [extractor/MLBTV] Detect live streams +* [extractor/motorsport] Support native embeds +* [extractor/Mxplayer] Fix extractor by [itachi-19](https://github.com/itachi-19) +* [extractor/nebula] Add nebula.tv by [tannertechnology](https://github.com/tannertechnology) +* [extractor/nfl] Fix extractor by [bashonly](https://github.com/bashonly) +* [extractor/ondemandkorea] Update `jw_config` regex by [julien-hadleyjack](https://github.com/julien-hadleyjack) +* [extractor/paramountplus] Better DRM detection by [bashonly](https://github.com/bashonly) +* [extractor/patreon] Sort formats +* [extractor/rcs] Fix embed extraction by [coletdjnz](https://github.com/coletdjnz) +* [extractor/redgifs] Fix extractor by [jhwgh1968](https://github.com/jhwgh1968) +* [extractor/rutube] Fix `_EMBED_REGEX` by [coletdjnz](https://github.com/coletdjnz) +* [extractor/RUTV] Fix warnings for livestreams by [Lesmiscore](https://github.com/Lesmiscore) +* [extractor/soundcloud:search] More metadata in `--flat-playlist` by [SuperSonicHub1](https://github.com/SuperSonicHub1) +* [extractor/telegraaf] Use mobile GraphQL API endpoint by [coletdjnz](https://github.com/coletdjnz) +* [extractor/tennistv] Fix timestamp by [zenerdi0de](https://github.com/zenerdi0de) +* [extractor/tiktok] Fix TikTokIE by [bashonly](https://github.com/bashonly) +* [extractor/triller] Fix auth token by [bashonly](https://github.com/bashonly) +* [extractor/trovo] Fix extractors by [Mehavoid](https://github.com/Mehavoid) +* [extractor/tv2] Support new url format by [tobi1805](https://github.com/tobi1805) +* [extractor/web.archive:youtube] Fix `_YT_INITIAL_PLAYER_RESPONSE_RE` +* [extractor/wistia] Add support for channels by [coletdjnz](https://github.com/coletdjnz) +* [extractor/wistia] Match IDs in embed URLs by [bashonly](https://github.com/bashonly) +* [extractor/wordpress:playlist] Add 
generic embed extractor by [coletdjnz](https://github.com/coletdjnz) +* [extractor/yandexvideopreview] Update `_VALID_URL` by [Grub4K](https://github.com/Grub4K) +* [extractor/zee5] Fix `_VALID_URL` by [m4tu4g](https://github.com/m4tu4g) +* [extractor/zee5] Generate device ids by [freezboltz](https://github.com/freezboltz) + + ### 2022.09.01 * Add option `--use-extractors` diff --git a/supportedsites.md b/supportedsites.md index 7b1e72016b..48888f61fa 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -3,11 +3,12 @@ # Supported sites - **0000studio:clip** - **17live** - **17live:clip** + - **1News**: 1news.co.nz article videos - **1tv**: Первый канал - - **20.detik.com** - **20min** - **23video** - **247sports** + - **24tv.ua** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -134,6 +135,7 @@ # Supported sites - **BehindKink** - **Bellator** - **BellMedia** + - **BerufeTV** - **Bet** - **bfi:player** - **bfmtv** @@ -147,9 +149,11 @@ # Supported sites - **Bilibili category extractor** - **BilibiliAudio** - **BilibiliAudioAlbum** - - **BilibiliChannel** - **BiliBiliPlayer** - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix + - **BilibiliSpaceAudio** + - **BilibiliSpacePlaylist** + - **BilibiliSpaceVideo** - **BiliIntl**: [<abbr title="netrc machine"><em>biliintl</em></abbr>] - **BiliIntlSeries**: [<abbr title="netrc machine"><em>biliintl</em></abbr>] - **BiliLive** @@ -167,6 +171,7 @@ # Supported sites - **Bloomberg** - **BokeCC** - **BongaCams** + - **BooyahClips** - **BostonGlobe** - **Box** - **Bpb**: Bundeszentrale für politische Bildung @@ -179,6 +184,7 @@ # Supported sites - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **Bundesliga** - **BusinessInsider** - **BuzzFeed** - **BYUtv** @@ -247,6 +253,7 @@ # Supported sites - **CNN** - **CNNArticle** - **CNNBlogs** + - **CNNIndonesia** - **ComedyCentral** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED @@ -303,6 +310,7 @@ # Supported sites - **defense.gouv.fr** - **democracynow** - **DestinationAmerica** + - **DetikEmbed** - **DHM**: Filmarchiv - Deutsches Historisches Museum - **Digg** - **DigitalConcertHall**: [<abbr title="netrc machine"><em>digitalconcerthall</em></abbr>] DigitalConcertHall extractor @@ -478,6 +486,7 @@ # Supported sites - **google:podcasts:feed** - **GoogleDrive** - **GoogleDrive:Folder** + - **GoPlay**: [<abbr title="netrc machine"><em>goplay</em></abbr>] - **GoPro** - **Goshgay** - **GoToStage** @@ -527,11 +536,14 @@ # Supported sites - **Hypem** - **Hytale** - **Icareus** + - **iflix:episode** + - **IflixSeries** - **ign.com** - **IGNArticle** - **IGNVideo** - **IHeartRadio** - **iheartradio:podcast** + - **Iltalehti** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** @@ -556,6 +568,7 @@ # Supported sites - **iqiyi**: [<abbr title="netrc machine"><em>iqiyi</em></abbr>] 爱奇艺 - **IslamChannel** - **IslamChannelSeries** + - **IsraelNationalNews** - **ITProTV** - **ITProTVCourse** - **ITTF** @@ -688,6 +701,7 @@ # Supported sites - **Mediasite** - **MediasiteCatalog** - **MediasiteNamedCatalog** + - **MediaWorksNZVOD** - **Medici** - **megaphone.fm**: megaphone.fm embedded players - **megatvcom**: megatv.com videos 
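Among the utility changes listed in the changelog above, the `traverse_obj` rewrite is the one most visible across the extractor diffs in this series; a small usage sketch of its behaviour (the sample data is made up for illustration):

```python
from yt_dlp.utils import traverse_obj

data = {'videos': [{'id': 'a1', 'meta': {'views': '123'}}, {'id': 'b2'}]}

# `...` branches over every list element; missing keys are skipped, not raised
ids = traverse_obj(data, ('videos', ..., 'id'))  # ['a1', 'b2']

# get_all=False returns only the first match instead of a list
views = traverse_obj(data, ('videos', ..., 'meta', 'views'), get_all=False)  # '123'
```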
@@ -700,6 +714,7 @@ # Supported sites - **mewatch** - **Mgoon** - **MiaoPai** + - **MicrosoftEmbed** - **microsoftstream**: Microsoft Stream - **mildom**: Record ongoing live by specific user in Mildom - **mildom:clip**: Clip in Mildom @@ -799,6 +814,7 @@ # Supported sites - **NBCSports** - **NBCSportsStream** - **NBCSportsVPlayer** + - **NBCStations** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** - **ndr:embed:base** @@ -833,8 +849,8 @@ # Supported sites - **NexxEmbed** - **NFB** - **NFHSNetwork** - - **nfl.com**: (**Currently broken**) - - **nfl.com:article**: (**Currently broken**) + - **nfl.com** + - **nfl.com:article** - **NhkForSchoolBangumi** - **NhkForSchoolProgramList** - **NhkForSchoolSubject**: Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学) @@ -1012,6 +1028,7 @@ # Supported sites - **PornoVoisines** - **PornoXO** - **PornTube** + - **PrankCast** - **PremiershipRugby** - **PressTV** - **ProjectVeritas** @@ -1192,6 +1209,7 @@ # Supported sites - **Slideshare** - **SlidesLive** - **Slutload** + - **Smotrim** - **Snotr** - **Sohu** - **SonyLIV**: [<abbr title="netrc machine"><em>sonyliv</em></abbr>] @@ -1221,8 +1239,8 @@ # Supported sites - **Sport5** - **SportBox** - **SportDeutschland** - - **spotify**: Spotify episodes - - **spotify:show**: Spotify shows + - **spotify**: Spotify episodes (**Currently broken**) + - **spotify:show**: Spotify shows (**Currently broken**) - **Spreaker** - **SpreakerPage** - **SpreakerShow** @@ -1316,10 +1334,10 @@ # Supported sites - **ThreeSpeak** - **ThreeSpeakUser** - **TikTok** - - **tiktok:effect** - - **tiktok:sound** - - **tiktok:tag** - - **tiktok:user** + - **tiktok:effect**: (**Currently broken**) + - **tiktok:sound**: (**Currently broken**) + - **tiktok:tag**: (**Currently broken**) + - **tiktok:user**: (**Currently broken**) - **tinypic**: tinypic.com videos - **TLC** - **TMZ** @@ -1360,6 +1378,7 @@ # Supported sites - **Turbo** - **tv.dfb.de** - **TV2** + - **TV24UAGenericPassthrough** - **TV2Article** - **TV2DK** - **TV2DKBornholmPlay** @@ -1422,6 +1441,7 @@ # Supported sites - **umg:de**: Universal Music Deutschland - **Unistra** - **Unity** + - **UnscriptedNewsVideo** - **uol.com.br** - **uplynk** - **uplynk:preplay** @@ -1466,8 +1486,6 @@ # Supported sites - **VidioLive**: [<abbr title="netrc machine"><em>vidio</em></abbr>] - **VidioPremier**: [<abbr title="netrc machine"><em>vidio</em></abbr>] - **VidLii** - - **vier**: [<abbr title="netrc machine"><em>vier</em></abbr>] vier.be and vijf.be - - **vier:videos** - **viewlift** - **viewlift:embed** - **Viidea** @@ -1563,8 +1581,10 @@ # Supported sites - **Willow** - **WimTV** - **Wistia** + - **WistiaChannel** - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **wordpress:playlist** - **WorldStarHipHop** - **wppilot** - **wppilot:channels** @@ -1628,6 +1648,7 @@ # Supported sites - **youtube:search**: YouTube search; "ytsearch:" prefix - **youtube:search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix - **youtube:search_url**: YouTube search URLs with sorting and filter support + - **youtube:shorts:pivot:audio**: YouTube Shorts audio pivot (Shorts using audio of a given video) - **youtube:stories**: YouTube channel stories; "ytstories:" prefix - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) - **youtube:tab**: YouTube Tabs From 57fb88093ea08108f3118b69bc56353625b34c5c Mon Sep 17 00:00:00 2001 From: github-actions 
<github-actions@example.com> Date: Tue, 4 Oct 2022 04:50:32 +0000 Subject: [PATCH 235/284] [version] update Created by: pukkandan :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++++---- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++++---- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++++---- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++++---- yt_dlp/version.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index af0320569c..c4bad101b3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,7 +62,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -70,8 +70,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 55ee9d3b7e..6cbdc8ee89 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -74,7 +74,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - 
Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -82,8 +82,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index a3a786e387..15101e885f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -70,7 +70,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -78,8 +78,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 4613fd35d1..aa03087cf3 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -55,7 +55,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] 
Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -63,8 +63,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 0eaee4441b..47f6644a4f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -51,7 +51,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -59,7 +59,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index acfbeb74b9..996f90679f 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.09.01** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2022.10.04** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2022.09.01 [9d339c4] (win32_exe) + [debug] yt-dlp version 2022.10.04 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,7 +65,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2022.09.01, Current version: 2022.09.01 - yt-dlp is up to date (2022.09.01) + Latest version: 2022.10.04, Current version: 2022.10.04 + yt-dlp is up to date (2022.10.04) <more lines> render: shell diff --git a/yt_dlp/version.py b/yt_dlp/version.py index ac7a825eae..1123205bdc 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2022.09.01' +__version__ = '2022.10.04' -RELEASE_GIT_HEAD = '5d7c7d656' +RELEASE_GIT_HEAD = '4e0511f27' VARIANT = None From 1305b659ef2bf3c76851b9400c7ac4a8f100fce2 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 4 Oct 2022 10:31:49 +0530 Subject: [PATCH 236/284] [extractor/detik] Avoid unnecessary extraction --- yt_dlp/extractor/detik.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/detik.py b/yt_dlp/extractor/detik.py index 7ee6f2746a..7209e66118 100644 --- a/yt_dlp/extractor/detik.py +++ b/yt_dlp/extractor/detik.py @@ -114,18 +114,15 @@ class DetikEmbedIE(InfoExtractor): }] def _extract_from_webpage(self, url, webpage): - display_id = url_basename(url) player_type, video_data = self._search_regex( r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})', webpage, 'playerjs', group=('type', 'video_data'), default=(None, '')) - - json_ld_data = self._search_json_ld(webpage, display_id, default={}) - extra_info_dict = {} - if not player_type: return - elif player_type == 'flowplayer': + display_id, extra_info_dict = url_basename(url), {} + + if player_type == 'flowplayer': video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id) video_url = video_json_data['videoUrl'] @@ -151,6 +148,7 @@ def _extract_from_webpage(self, url, webpage): formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) self._sort_formats(formats) + json_ld_data = self._search_json_ld(webpage, display_id, default={}) yield merge_dicts(json_ld_data, extra_info_dict, { 'display_id': display_id, 'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage), From 98d4ec1ef287cc5655ce6afd7b17755c57a245cb Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 4 Oct 2022 23:02:12 +0530 Subject: [PATCH 237/284] [build] Pin `py2exe` version Workaround for #5135 --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 45c5a43ccc..2b4e2f46b1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -193,7 +193,7 @@ jobs: python-version: '3.8' - name: 
Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds - python -m pip install --upgrade pip setuptools wheel py2exe + python -m pip install --upgrade pip setuptools wheel "py2exe<0.12" pip install "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.3-py3-none-any.whl" -r requirements.txt - name: Prepare From bf2e1ec67a5cdaa9039e91cd39c1f670649068a8 Mon Sep 17 00:00:00 2001 From: invertico <8355966+invertico@users.noreply.github.com> Date: Tue, 4 Oct 2022 20:22:07 +0200 Subject: [PATCH 238/284] [extractor/livestreamfails] Support posts (#5139) Authored by: invertico --- yt_dlp/extractor/livestreamfails.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/livestreamfails.py b/yt_dlp/extractor/livestreamfails.py index d6f626a99c..0df638422c 100644 --- a/yt_dlp/extractor/livestreamfails.py +++ b/yt_dlp/extractor/livestreamfails.py @@ -3,7 +3,7 @@ class LivestreamfailsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?livestreamfails\.com/clip/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?livestreamfails\.com/(?:clip|post)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://livestreamfails.com/clip/139200', 'md5': '8a03aea1a46e94a05af6410337463102', @@ -17,6 +17,9 @@ class LivestreamfailsIE(InfoExtractor): 'timestamp': 1656271785, 'upload_date': '20220626', } + }, { + 'url': 'https://livestreamfails.com/post/139200', + 'only_matching': True, }] def _real_extract(self, url): From aebb4f4ba78ec7542416832e9dd5e47788cb12aa Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 5 Oct 2022 09:15:22 +0530 Subject: [PATCH 239/284] Fix for formats=None Fixes: https://github.com/yt-dlp/yt-dlp/pull/4965#issuecomment-1267682512 --- yt_dlp/YoutubeDL.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 53681149e1..e1c24b8925 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2525,11 +2525,7 @@ def sanitize_numeric_fields(info): info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, automatic_captions) - if info_dict.get('formats') is None: - # There's only one format available - formats = [info_dict] - else: - formats = info_dict['formats'] + formats = self._get_formats(info_dict) # or None ensures --clean-infojson removes it info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None @@ -2644,7 +2640,7 @@ def is_wellformed(f): info_dict, _ = self.pre_process(info_dict, 'after_filter') # The pre-processors may have modified the formats - formats = info_dict.get('formats', [info_dict]) + formats = self._get_formats(info_dict) list_only = self.params.get('simulate') is None and ( self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) @@ -3571,11 +3567,17 @@ def _format_note(self, fdict): res += '~' + format_bytes(fdict['filesize_approx']) return res - def render_formats_table(self, info_dict): - if not info_dict.get('formats') and not info_dict.get('url'): - return None + def _get_formats(self, info_dict): + if info_dict.get('formats') is None: + if info_dict.get('url') and info_dict.get('_type', 'video') == 'video': + return [info_dict] + return [] + return info_dict['formats'] - formats = info_dict.get('formats', [info_dict]) + def render_formats_table(self, info_dict): + formats = self._get_formats(info_dict) + if not formats: + return if not self.params.get('listformats_table', True) is 
not False: table = [ [ From 09c127ff838505de1bddde56ad4d22f46ebf6ed7 Mon Sep 17 00:00:00 2001 From: Sergey <SG5@users.noreply.github.com> Date: Wed, 5 Oct 2022 20:54:41 -0700 Subject: [PATCH 240/284] [extractor/Tnaflix] Fix for HTTP 500 (#5150) Closes #5107 Authored by: SG5 --- yt_dlp/extractor/tnaflix.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/tnaflix.py b/yt_dlp/extractor/tnaflix.py index 34361e515a..8cbfeb7fba 100644 --- a/yt_dlp/extractor/tnaflix.py +++ b/yt_dlp/extractor/tnaflix.py @@ -19,6 +19,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] _HOST = 'tna' + _VIDEO_XML_URL = 'https://www.tnaflix.com/cdn/cdn.php?file={}.fid&key={}&VID={}&nomp4=1&catID=0&rollover=1&startThumb=12&embed=0&utm_source=0&multiview=0&premium=1&country=0user=0&vip=1&cd=0&ref=0&alpha' _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' @@ -71,6 +72,10 @@ def get_child(elem, names): def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') + + def extract_field(pattern, name): + return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None + for display_id_key in ('display_id', 'display_id_2'): if display_id_key in mobj.groupdict(): display_id = mobj.group(display_id_key) @@ -85,6 +90,13 @@ def _real_extract(self, url): self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, group='url'), 'http:') + if not cfg_url: + vkey = extract_field(r'<input\b[^>]+\bid="vkey"\b[^>]+\bvalue="([^"]+)"', 'vkey') + nkey = extract_field(r'<input\b[^>]+\bid="nkey"\b[^>]+\bvalue="([^"]+)"', 'nkey') + vid = extract_field(r'<input\b[^>]+\bid="VID"\b[^>]+\bvalue="([^"]+)"', 'vid') + if vkey and nkey and vid: + cfg_url = self._proto_relative_url(self._VIDEO_XML_URL.format(vkey, nkey, vid), 'http:') + if not cfg_url: inputs = self._hidden_inputs(webpage) cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' @@ -139,9 +151,6 @@ def extract_video_url(vl): duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) - def extract_field(pattern, name): - return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - description = extract_field(self._DESCRIPTION_REGEX, 'description') uploader = extract_field(self._UPLOADER_REGEX, 'uploader') view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) From f03940963ed02f0e4a99afaa2673a4329741c420 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 6 Oct 2022 05:10:54 +0000 Subject: [PATCH 241/284] [extractor/dplay] Add MotorTrendOnDemand extractor (#5151) Closes #5141 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/dplay.py | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 44c189f797..2b603f4f25 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -440,6 +440,7 @@ AnimalPlanetIE, TLCIE, MotorTrendIE, + MotorTrendOnDemandIE, DiscoveryPlusIndiaIE, DiscoveryNetworksDeIE, DiscoveryPlusItalyIE, diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index e7629a5e16..3f0b315a57 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -745,6 +745,45 @@ 
class MotorTrendIE(DiscoveryPlusBaseIE): } +class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', + 'info_dict': { + 'id': '37699', + 'display_id': 'wheelstanding-dump-truck-stubby-bobs-comeback/37699', + 'ext': 'mp4', + 'title': 'Wheelstanding Dump Truck! Stubby Bob’s Comeback', + 'description': 'md5:996915abe52a1c3dfc83aecea3cce8e7', + 'season_number': 5, + 'episode_number': 52, + 'episode': 'Episode 52', + 'season': 'Season 5', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'timestamp': 1388534401, + 'duration': 1887.345, + 'creator': 'Originals', + 'series': 'Roadkill', + 'upload_date': '20140101', + 'tags': [], + }, + }] + + _PRODUCT = 'MTOD' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.motortrendondemand.com', + 'realm': 'motortrend', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:4.39.1-gi1', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class DiscoveryPlusIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ From 867c66ff97b0639485a2b6ebc28f2e0df0bf8187 Mon Sep 17 00:00:00 2001 From: Matthew <coletdjnz@protonmail.com> Date: Fri, 7 Oct 2022 20:00:40 +1300 Subject: [PATCH 242/284] [extractor/youtube] Extract concurrent view count for livestreams (#5152) Adds new field `concurrent_view_count` Closes https://github.com/yt-dlp/yt-dlp/issues/4843 Authored by: coletdjnz --- README.md | 1 + yt_dlp/extractor/common.py | 1 + yt_dlp/extractor/youtube.py | 27 +++++++++++++++++++-------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e0a1ea059b..9b59e096a9 100644 --- a/README.md +++ b/README.md @@ -1226,6 +1226,7 @@ # OUTPUT TEMPLATE - `duration` (numeric): Length of the video in seconds - `duration_string` (string): Length of the video (HH:mm:ss) - `view_count` (numeric): How many users have watched the video on the platform + - `concurrent_view_count` (numeric): How many users are currently watching the video on the platform. - `like_count` (numeric): Number of positive ratings of the video - `dislike_count` (numeric): Number of negative ratings of the video - `repost_count` (numeric): Number of reposts of the video diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 944b196a11..31a45b37a2 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -284,6 +284,7 @@ class InfoExtractor: captions instead of normal subtitles duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. + concurrent_view_count: How many users are currently watching the video on the platform. 
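The conditional dictionary key used in the youtube.py hunk above is worth calling out: the same renderer count is routed to a different field depending on liveness. A minimal restatement with placeholder values:

```python
live_status = 'is_live'  # placeholder; derived from overlay badges in the real code
view_count = 42          # placeholder

entry = {
    # Live and upcoming renderers report concurrent viewers, not total views
    'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count': view_count,
}
assert entry == {'concurrent_view_count': 42}
```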
like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video repost_count: Number of reposts of the video diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4456110f6c..6f153bb3cf 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -912,8 +912,7 @@ def _extract_video(self, renderer): traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), video_id, default=None, group='duration')) - view_count = self._get_count(renderer, 'viewCountText') - + view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText') uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), @@ -932,6 +931,12 @@ def _extract_video(self, renderer): if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: url = f'https://www.youtube.com/shorts/{video_id}' + live_status = ( + 'is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) + else None) + return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -940,17 +945,12 @@ def _extract_video(self, renderer): 'title': title, 'description': description, 'duration': duration, - 'view_count': view_count, 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, 'upload_date': (strftime_or_none(self._parse_time_text(time_text), '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None), - 'live_status': ('is_upcoming' if scheduled_timestamp is not None - else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) - else None), 'release_timestamp': scheduled_timestamp, 'availability': 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) @@ -958,7 +958,8 @@ def _extract_video(self, renderer): is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None, needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, - is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None) + is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), + 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count': view_count, } @@ -2328,6 +2329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'playable_in_embed': True, 'description': 'md5:2ef1d002cad520f65825346e2084e49d', + 'concurrent_view_count': int, }, 'params': {'skip_download': True} }, { @@ -4115,6 +4117,15 @@ def process_language(container, base_url, lang_code, sub_name, query): 'like_count': str_to_int(like_count), 'dislike_count': str_to_int(dislike_count), }) + vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) + if vcr: + vc = self._get_count(vcr, 'viewCount') + # Upcoming premieres with waiting count are treated as live here + if vcr.get('isLive'): + info['concurrent_view_count'] = vc + elif info.get('view_count') is None: + info['view_count'] = vc + vsir = get_first(contents, 'videoSecondaryInfoRenderer') if vsir: vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer')) From e02e6d86dbca8852a8f1df934b8f4a30552060d2 Mon Sep 17 00:00:00 2001 From: 
Noah <10456231+How-Bout-No@users.noreply.github.com> Date: Fri, 7 Oct 2022 08:04:27 -0400 Subject: [PATCH 243/284] [embedthumbnail] Fix thumbnail name in mp3 (#5163) Authored by: How-Bout-No --- yt_dlp/postprocessor/embedthumbnail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 9ae59a7c31..b02d9d499d 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -92,7 +92,7 @@ def run(self, info): if info['ext'] == 'mp3': options = [ '-c', 'copy', '-map', '0:0', '-map', '1:0', '-write_id3v1', '1', '-id3v2_version', '3', - '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (front)"'] + '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment=Cover (front)'] self._report_run('ffmpeg', filename) self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) From 2e565f5bcacd2ab25bb57160313048b398afab4c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 7 Oct 2022 12:10:12 +0000 Subject: [PATCH 244/284] [extractor/reddit] Add fallback format (#5165) Closes #5160 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index aabc8dba9b..c713b24fed 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -36,6 +36,26 @@ class RedditIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # 1080p fallback format + 'url': 'https://www.reddit.com/r/aww/comments/90bu6w/heat_index_was_110_degrees_so_we_offered_him_a/', + 'md5': '8b5902cfda3006bf90faea7adf765a49', + 'info_dict': { + 'id': 'gyh95hiqc0b11', + 'ext': 'mp4', + 'display_id': '90bu6w', + 'title': 'Heat index was 110 degrees so we offered him a cold drink. 
He went for a full body soak instead', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1532051078, + 'upload_date': '20180720', + 'uploader': 'FootLoosePickleJuice', + 'duration': 14, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, @@ -145,9 +165,18 @@ def add_thumbnail(src): dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' - formats = self._extract_m3u8_formats( - hls_playlist_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats = [{ + 'url': unescapeHTML(reddit_video['fallback_url']), + 'height': int_or_none(reddit_video.get('height')), + 'width': int_or_none(reddit_video.get('width')), + 'tbr': int_or_none(reddit_video.get('bitrate_kbps')), + 'acodec': 'none', + 'ext': 'mp4', + 'format_id': 'fallback', + 'format_note': 'DASH video, mp4_dash', + }] + formats.extend(self._extract_m3u8_formats( + hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) formats.extend(self._extract_mpd_formats( dash_playlist_url, display_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) From 3b55aaac596e7a08730439eb8cac4e240f4b250b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 7 Oct 2022 20:35:46 +0000 Subject: [PATCH 245/284] [extractor/tubitv] Better DRM detection (#5171) Closes #5128 Authored by: bashonly --- yt_dlp/extractor/tubitv.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py index d91a46500c..f5ed950be6 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -22,6 +22,19 @@ class TubiTvIE(InfoExtractor): _NETRC_MACHINE = 'tubitv' _GEO_COUNTRIES = ['US'] _TESTS = [{ + 'url': 'https://tubitv.com/movies/383676/tracker', + 'md5': '566fa0f76870302d11af0de89511d3f0', + 'info_dict': { + 'id': '383676', + 'ext': 'mp4', + 'title': 'Tracker', + 'description': 'md5:ff320baf43d0ad2655e538c1d5cd9706', + 'uploader_id': 'f866e2677ea2f0dff719788e4f7f9195', + 'release_year': 2010, + 'thumbnail': r're:^https?://.+\.(jpe?g|png)$', + 'duration': 6122, + }, + }, { 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', 'md5': '43ac06be9326f41912dc64ccf7a80320', 'info_dict': { @@ -31,12 +44,10 @@ class TubiTvIE(InfoExtractor): 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', }, + 'skip': 'Content Unavailable' }, { 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', 'only_matching': True, - }, { - 'url': 'http://tubitv.com/movies/383676/tracker', - 'only_matching': True, }, { 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', 'info_dict': { @@ -47,11 +58,13 @@ class TubiTvIE(InfoExtractor): 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', 'release_year': 1979, }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Content Unavailable' }] + # DRM formats are included only to raise appropriate error + _UNPLAYABLE_FORMATS = ('hlsv6_widevine', 'hlsv6_widevine_nonclearlead', 'hlsv6_playready_psshv0', + 'hlsv6_fairplay', 'dash_widevine', 'dash_widevine_nonclearlead') + def _perform_login(self, username, password): 
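# Editor's note (illustrative, not part of the upstream patch): _perform_login
# is invoked by the InfoExtractor base class when credentials for
# _NETRC_MACHINE ('tubitv', defined above) are supplied; this patch leaves the
# login flow untouched and only adds _UNPLAYABLE_FORMATS plus the DRM and
# availability checks in _real_extract below.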
self.report_login() form_data = { @@ -69,17 +82,26 @@ def _perform_login(self, username, password): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://tubitv.com/oz/videos/%s/content?video_resources=dash&video_resources=hlsv3&video_resources=hlsv6' % video_id, video_id) + video_data = self._download_json(f'https://tubitv.com/oz/videos/{video_id}/content', video_id, query={ + 'video_resources': ['dash', 'hlsv3', 'hlsv6', *self._UNPLAYABLE_FORMATS], + }) title = video_data['title'] formats = [] + drm_formats = False for resource in video_data['video_resources']: if resource['type'] in ('dash', ): formats += self._extract_mpd_formats(resource['manifest']['url'], video_id, mpd_id=resource['type'], fatal=False) elif resource['type'] in ('hlsv3', 'hlsv6'): formats += self._extract_m3u8_formats(resource['manifest']['url'], video_id, 'mp4', m3u8_id=resource['type'], fatal=False) + elif resource['type'] in self._UNPLAYABLE_FORMATS: + drm_formats = True + + if not formats and drm_formats: + self.report_drm(video_id) + elif not formats and not video_data.get('policy_match'): # policy_match is False if content was removed + raise ExtractorError('This content is currently unavailable', expected=True) self._sort_formats(formats) From f99bbfc9838d98d81027dddb18ace0af66acdf6d Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Sun, 9 Oct 2022 03:27:32 +0200 Subject: [PATCH 246/284] [utils] `traverse_obj`: Always return list when branching (#5170) Fixes #5162 Authored by: Grub4K --- test/test_utils.py | 27 +++++++++++++++++++++++---- yt_dlp/utils.py | 22 ++++++++++++++-------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 69313564a1..6f3f6cb914 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1890,6 +1890,7 @@ def test_traverse_obj(self): {'index': 2}, {'index': 3}, ), + 'dict': {}, } # Test base functionality @@ -1926,11 +1927,15 @@ def test_traverse_obj(self): # Test alternative paths self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', - msg='multiple `path_list` should be treated as alternative paths') + msg='multiple `paths` should be treated as alternative paths') self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str', msg='alternatives should exit early') self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None, msg='alternatives should return `default` if exhausted') + self.assertEqual(traverse_obj(_TEST_DATA, (..., 'fail'), 100), 100, + msg='alternatives should track their own branching return') + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', ...), ('data', ...)), list(_TEST_DATA['data']), + msg='alternatives on empty objects should search further') # Test branch and path nesting self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'], @@ -1963,8 +1968,16 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}), {0: ['https://www.example.com/1', 'https://www.example.com/0']}, msg='tripple nesting in dict path should be treated as branches') - self.assertEqual(traverse_obj({}, {0: 1}, default=...), {0: ...}, - msg='do not remove `None` values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, + msg='remove `None` values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=...), {0: ...}, + msg='do not remove `None` values if 
`default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}}, + msg='do not remove empty values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=...), {0: {}}, + msg='do not remove empty values when dict key and a default') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', ...)}), {0: []}, + msg='if branch in dict key not successful, return `[]`') # Testing default parameter behavior _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} @@ -1981,7 +1994,13 @@ def test_traverse_obj(self): self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None, msg='`IndexError` should result in `default`') self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1), 1, - msg='if branched but not successfull return `default`, not `[]`') + msg='if branched but not successful return `default` if defined, not `[]`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=None), None, + msg='if branched but not successful return `default` even if `default` is `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail')), [], + msg='if branched but not successful return `[]`, not `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', ...)), [], + msg='if branched but object is empty return `[]`, not `default`') # Testing expected_type behavior _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d0be7f19ef..7d8e971626 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5294,7 +5294,7 @@ def load_plugins(name, suffix, namespace): def traverse_obj( - obj, *paths, default=None, expected_type=None, get_all=True, + obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, casesense=True, is_user_input=False, traverse_string=False): """ Safely traverse nested `dict`s and `Sequence`s @@ -5304,6 +5304,7 @@ def traverse_obj( "value" Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. A value of None is treated as the absence of a value. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. @@ -5342,6 +5343,7 @@ def traverse_obj( @returns The result of the object traversal. If successful, `get_all=True`, and the path branches at least once, then a list of results is returned instead. + A list is always returned if the last path branches and no `default` is given. 
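(Editor's illustration, mirroring the new tests above rather than the original docstring: with the last path branching and no `default`, `traverse_obj({'a': []}, ('a', ...))` returns `[]`, while `traverse_obj({'a': []}, ('a', ...), default=None)` returns `None`.)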
""" is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes)) casefold = lambda k: k.casefold() if isinstance(k, str) else k @@ -5385,7 +5387,7 @@ def apply_key(key, obj): elif isinstance(key, dict): iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) yield {k: v if v is not None else default for k, v in iter_obj - if v is not None or default is not None} + if v is not None or default is not NO_DEFAULT} elif isinstance(obj, dict): yield (obj.get(key) if casesense or (key in obj) @@ -5426,18 +5428,22 @@ def apply_path(start_obj, path): return has_branched, objs - def _traverse_obj(obj, path): + def _traverse_obj(obj, path, use_list=True): has_branched, results = apply_path(obj, path) results = LazyList(x for x in map(type_test, results) if x is not None) - if results: - return results.exhaust() if get_all and has_branched else results[0] - for path in paths: - result = _traverse_obj(obj, path) + if get_all and has_branched: + return results.exhaust() if results or use_list else None + + return results[0] if results else None + + for index, path in enumerate(paths, 1): + use_list = default is NO_DEFAULT and index == len(paths) + result = _traverse_obj(obj, path, use_list) if result is not None: return result - return default + return None if default is NO_DEFAULT else default def traverse_dict(dictn, keys, casesense=True): From 7b0127e1e11186bcbb80a18b1b530d864a5dbada Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Sun, 9 Oct 2022 03:31:37 +0200 Subject: [PATCH 247/284] [utils] `traverse_obj`: Allow `re.Match` objects (#5174) Authored by: Grub4K --- test/test_utils.py | 20 ++++++++++++++++++++ yt_dlp/utils.py | 22 +++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 6f3f6cb914..90085a9c0d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,6 +2,7 @@ # Allow direct execution import os +import re import sys import unittest @@ -2080,6 +2081,25 @@ def test_traverse_obj(self): with self.assertRaises(TypeError, msg='too many params should result in error'): traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), is_user_input=True) + # Test re.Match as input obj + mobj = re.fullmatch(r'0(12)(?P<group>3)(4)?', '0123') + self.assertEqual(traverse_obj(mobj, ...), [x for x in mobj.groups() if x is not None], + msg='`...` on a `re.Match` should give its `groups()`') + self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 2)), ['0123', '3'], + msg='function on a `re.Match` should give groupno, value starting at 0') + self.assertEqual(traverse_obj(mobj, 'group'), '3', + msg='str key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 2), '3', + msg='int key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 'gRoUp', casesense=False), '3', + msg='str key on a `re.Match` should respect casesense') + self.assertEqual(traverse_obj(mobj, 'fail'), None, + msg='failing str key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, 'gRoUpS', casesense=False), None, + msg='failing str key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, 8), None, + msg='failing int key on a `re.Match` should return `default`') + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 7d8e971626..cb14908c7f 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5305,13 
+5305,14 @@ def traverse_obj( Each of the provided `paths` is tested and the first producing a valid result will be returned. The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. A value of None is treated as the absence of a value. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. The keys in the path can be one of: - `None`: Return the current object. - - `str`/`int`: Return `obj[key]`. + - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - `slice`: Branch out and return all values in `obj[key]`. - `Ellipsis`: Branch out and return a list of all values. - `tuple`/`list`: Branch out and return a list of all matching values. @@ -5322,7 +5323,7 @@ def traverse_obj( - `dict` Transform the current object and return a matching dict. Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - `tuple`, `list`, and `dict` all support nested paths and branches + `tuple`, `list`, and `dict` all support nested paths and branches. @params paths Paths which to traverse by. @param default Value to return if the paths do not match. @@ -5370,6 +5371,8 @@ def apply_key(key, obj): yield from obj.values() elif is_sequence(obj): yield from obj + elif isinstance(obj, re.Match): + yield from obj.groups() elif traverse_string: yield from str(obj) @@ -5378,6 +5381,8 @@ def apply_key(key, obj): iter_obj = enumerate(obj) elif isinstance(obj, collections.abc.Mapping): iter_obj = obj.items() + elif isinstance(obj, re.Match): + iter_obj = enumerate((obj.group(), *obj.groups())) elif traverse_string: iter_obj = enumerate(str(obj)) else: @@ -5389,10 +5394,21 @@ def apply_key(key, obj): yield {k: v if v is not None else default for k, v in iter_obj if v is not None or default is not NO_DEFAULT} - elif isinstance(obj, dict): + elif isinstance(obj, collections.abc.Mapping): yield (obj.get(key) if casesense or (key in obj) else next((v for k, v in obj.items() if casefold(k) == key), None)) + elif isinstance(obj, re.Match): + if isinstance(key, int) or casesense: + with contextlib.suppress(IndexError): + yield obj.group(key) + return + + if not isinstance(key, str): + return + + yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + else: if is_user_input: key = (int_or_none(key) if ':' not in key From 540236ce11a133675a3a9ea9b373155274fab550 Mon Sep 17 00:00:00 2001 From: Teemu Ikonen <tpikonen@gmail.com> Date: Sun, 9 Oct 2022 04:34:22 +0300 Subject: [PATCH 248/284] [extractor/screen9] Add extractor (#5137) Authored by: tpikonen --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/screen9.py | 63 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 yt_dlp/extractor/screen9.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2b603f4f25..06be8f8229 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1565,6 +1565,7 @@ from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE +from .screen9 import Screen9IE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .scrippsnetworks import ( diff --git a/yt_dlp/extractor/screen9.py b/yt_dlp/extractor/screen9.py new file mode 100644 index 0000000000..eae652af7d --- /dev/null +++ b/yt_dlp/extractor/screen9.py @@ -0,0 +1,63 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + +
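# Editor's sketch (assumed config shape, not from the patch): the extractor
# below leans on traverse_obj branch filters. Given a player config like
#   config = {'src': [{'type': 'application/x-mpegURL', 'src': 'https://cdn.example/index.m3u8'},
#                     {'type': 'video/mp4', 'src': 'https://cdn.example/video.mp4'}]}
# the call
#   traverse_obj(config, ('src', lambda _, v: v['type'] == 'video/mp4', 'src'), get_all=False)
# branches over config['src'], keeps entries whose 'type' matches, and returns
# the first matching 'src' value, i.e. 'https://cdn.example/video.mp4'.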
+class Screen9IE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.screen9\.(?:tv|com)|play\.su\.se)/(?:embed|media)/(?P<id>[^?#/]+)' + _TESTS = [ + { + 'url': 'https://api.screen9.com/embed/8kTNEjvoXGM33dmWwF0uDA', + 'md5': 'd60d23f8980583b930724b01fa6ddb41', + 'info_dict': { + 'id': '8kTNEjvoXGM33dmWwF0uDA', + 'ext': 'mp4', + 'title': 'Östersjön i förändrat klimat', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://folkhogskolekanalen.screen9.tv/media/gy35PKLHe-5K29RYHga2bw/ett-starkare-samhalle-en-snabbguide-om-sveriges-folkhogskolor', + 'md5': 'c9389806e78573ea34fc48b6f94465dc', + 'info_dict': { + 'id': 'gy35PKLHe-5K29RYHga2bw', + 'ext': 'mp4', + 'title': 'Ett starkare samhälle - en snabbguide om Sveriges folkhögskolor', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://play.su.se/media/H1YA0EYNCxiesrSU1kaRBQ/baltic-breakfast', + 'md5': '2b817647c3058002526269deff4c0683', + 'info_dict': { + 'id': 'H1YA0EYNCxiesrSU1kaRBQ', + 'ext': 'mp4', + 'title': 'Baltic Breakfast', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://api.screen9.com/embed/{video_id}', video_id) + config = self._search_json(r'var\s+config\s*=', webpage, 'config', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(config, ('src', lambda _, v: v['type'] == 'application/x-mpegURL', 'src'), get_all=False), + video_id, ext='mp4') + formats.append({ + 'url': traverse_obj(config, ('src', lambda _, v: v['type'] == 'video/mp4', 'src'), get_all=False), + 'format': 'mp4', + }) + + self._sort_formats(formats) + return { + 'id': video_id, + 'title': traverse_obj( + config, + ('plugins', (('title', 'title'), ('googleAnalytics', 'title'), ('share', 'mediaTitle'))), + get_all=False), + 'description': traverse_obj(config, ('plugins', 'title', 'description')), + 'thumbnail': traverse_obj(config, ('poster')), + 'formats': formats, + 'subtitles': subtitles, + } From 5d14b734918c2c1230cd103d013d54ff194617f7 Mon Sep 17 00:00:00 2001 From: Marenga <107524538+the-marenga@users.noreply.github.com> Date: Sun, 9 Oct 2022 03:50:44 +0200 Subject: [PATCH 249/284] [VK] Fix playlist URLs (#4930) Closes #2825 Authored by: the-marenga --- yt_dlp/extractor/vk.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 69f518b698..0c856e2b0a 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -536,7 +536,7 @@ def _real_extract(self, url): class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/@(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/(?:playlist/)?(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'https://vk.com/video/@mobidevices', @@ -550,6 +550,13 @@ class VKUserVideosIE(VKBaseIE): 'id': '-17892518_uploaded', }, 'playlist_mincount': 182, + }, { + 'url': 'https://vk.com/video/playlist/-174476437_2', + 'info_dict': { + 'id': '-174476437_2', + 'title': 'Анонсы' + }, + 'playlist_mincount': 108, }] _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) @@ -584,11 +591,19 @@ def _entries(self, page_id, section): def _real_extract(self, url): u_id, section = 
self._match_valid_url(url).groups() webpage = self._download_webpage(url, u_id) - page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') + + if u_id.startswith('@'): + page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') + elif '_' in u_id: + page_id, section = u_id.split('_', 1) + else: + raise ExtractorError('Invalid URL', expected=True) + if not section: section = 'all' - return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section)) + playlist_title = clean_html(get_element_by_class('VideoInfoPanel__title', webpage)) + return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section), playlist_title) class VKWallPostIE(VKBaseIE): From 866f0373445472ce7ff70da3572b2f178dcece85 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Sun, 9 Oct 2022 11:32:58 +0900 Subject: [PATCH 250/284] [extractor/nos.nl] Add extractor (#4822) Closes #4649 Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/nosnl.py | 95 +++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 yt_dlp/extractor/nosnl.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 06be8f8229..75cb3fcab9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1181,6 +1181,7 @@ from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nosnl import NOSNLArticleIE from .nova import ( NovaEmbedIE, NovaIE, diff --git a/yt_dlp/extractor/nosnl.py b/yt_dlp/extractor/nosnl.py new file mode 100644 index 0000000000..eba94c416d --- /dev/null +++ b/yt_dlp/extractor/nosnl.py @@ -0,0 +1,95 @@ +from .common import InfoExtractor +from ..utils import parse_duration, parse_iso8601, traverse_obj + + +class NOSNLArticleIE(InfoExtractor): + _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P<display_id>[\w-]+)' + _TESTS = [ + { + # only 1 video + 'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen', + 'info_dict': { + 'id': '2440340', + 'ext': 'mp4', + 'description': 'md5:5f83185d902ac97af3af4bed7ece3db5', + 'title': '\'We hebben een huis vol met scheuren\'', + 'duration': 95.0, + 'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg', + } + }, { + # more than 1 video + 'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten', + 'info_dict': { + 'id': '2440409', + 'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten', + 'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.', + 'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'], + 'modified_timestamp': 1660452773, + 'modified_date': '20220814', + 'upload_date': '20220813', + 'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg', + 'timestamp': 1660401384, + }, + 'playlist_count': 2, + }, { + # audio + video + 'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek', + 'info_dict': { + 'id': '2440789', + 'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ', + 'description': 'Nieuws, weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.', + 'tags': ['wekdienst'], + 
'modified_date': '20220816', + 'modified_timestamp': 1660625449, + 'timestamp': 1660625449, + 'upload_date': '20220816', + 'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg', + }, + 'playlist_count': 2, + } + ] + + def _entries(self, nextjs_json, display_id): + for item in nextjs_json['items']: + if item.get('type') == 'video': + formats, subtitle = self._extract_m3u8_formats_and_subtitles( + traverse_obj(item, ('source', 'url')), display_id, ext='mp4') + yield { + 'id': str(item['id']), + 'title': item.get('title'), + 'description': item.get('description'), + 'formats': formats, + 'subtitles': subtitle, + 'duration': parse_duration(item.get('duration')), + 'thumbnails': [{ + 'url': traverse_obj(image, ('url', ...), get_all=False), + 'width': image.get('width'), + 'height': image.get('height') + } for image in traverse_obj(item, ('imagesByRatio', ...))[0]], + } + + elif item.get('type') == 'audio': + yield { + 'id': str(item['id']), + 'title': item.get('title'), + 'url': traverse_obj(item, ('media', 'src')), + 'ext': 'mp3', + } + + def _real_extract(self, url): + display_id = self._match_valid_url(url).group('display_id') + webpage = self._download_webpage(url, display_id) + + nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data'] + return { + '_type': 'playlist', + 'entries': self._entries(nextjs_json, display_id), + 'id': str(nextjs_json['id']), + 'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage), + 'description': (nextjs_json.get('description') + or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)), + 'tags': nextjs_json.get('keywords'), + 'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')), + 'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage), + 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')) + } From f324fe8c590d3f4737cfd8b5a41eaa60edc546dc Mon Sep 17 00:00:00 2001 From: tkgmomosheep <8298025+tkgmomosheep@users.noreply.github.com> Date: Sun, 9 Oct 2022 10:34:12 +0800 Subject: [PATCH 251/284] [extractor/viu] Support subtitles of on-screen text (#5173) Authored by: tkgmomosheep --- yt_dlp/extractor/viu.py | 42 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 63b6fd3a12..d27091c94e 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -164,12 +164,17 @@ class ViuOTTIE(InfoExtractor): }, 'skip': 'Geo-restricted to Singapore', }, { - 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', + 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/430078/%E7%AC%AC%E5%85%AD%E6%84%9F-3', 'info_dict': { - 'id': '7123', + 'id': '430078', 'ext': 'mp4', - 'title': '這就是我的生活之道', - 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', + 'title': '大韓民國的1%', + 'description': 'md5:74d6db47ddd9ddb9c89a05739103ccdb', + 'episode_number': 1, + 'duration': 6614, + 'episode': '大韓民國的1%', + 'series': '第六感 3', + 'thumbnail': 'https://d2anahhhmp1ffz.cloudfront.net/1313295781/d2b14f48d008ef2f3a9200c98d8e9b63967b9cc2', }, 'params': { 'skip_download': 'm3u8 download', @@ -177,11 +182,12 @@ class ViuOTTIE(InfoExtractor): }, 'skip': 'Geo-restricted to Hong Kong', }, { - 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/68776/%E6%99%82%E5%B0%9A%E5%AA%BD%E5%92%AA', - 'playlist_count': 12, + 'url': 
'https://www.viu.com/ott/hk/zh-hk/vod/444666/%E6%88%91%E7%9A%84%E5%AE%A4%E5%8F%8B%E6%98%AF%E4%B9%9D%E5%B0%BE%E7%8B%90', + 'playlist_count': 16, 'info_dict': { - 'id': '3916', - 'title': '時尚媽咪', + 'id': '23807', + 'title': '我的室友是九尾狐', + 'description': 'md5:b42c95f2b4a316cdd6ae14ca695f33b9', }, 'params': { 'skip_download': 'm3u8 download', @@ -363,13 +369,19 @@ def download_playback(): subtitles = {} for sub in video_data.get('subtitle') or []: - sub_url = sub.get('url') - if not sub_url: - continue - subtitles.setdefault(sub.get('name'), []).append({ - 'url': sub_url, - 'ext': 'srt', - }) + lang = sub.get('name') or 'und' + if sub.get('url'): + subtitles.setdefault(lang, []).append({ + 'url': sub['url'], + 'ext': 'srt', + 'name': f'Spoken text for {lang}', + }) + if sub.get('second_subtitle_url'): + subtitles.setdefault(f'{lang}_ost', []).append({ + 'url': sub['second_subtitle_url'], + 'ext': 'srt', + 'name': f'On-screen text for {lang}', + }) title = strip_or_none(video_data.get('synopsis')) return { From 1d55ebabc93b8e422a0126fc307f2a8e50fa5a97 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Sun, 9 Oct 2022 05:17:58 +0200 Subject: [PATCH 252/284] [extractor/common] Fix `json_ld` type checks (#5145) Closes #5144, #5143 Authored by: Grub4K --- yt_dlp/extractor/common.py | 12 +++++------- yt_dlp/extractor/generic.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 31a45b37a2..18a52a8559 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1467,10 +1467,6 @@ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if not json_ld: return {} info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] INTERACTION_TYPE_MAP = { 'CommentAction': 'comment', @@ -1570,11 +1566,13 @@ def extract_video_object(e): extract_chapter_information(e) def traverse_json_ld(json_ld, at_top_level=True): - for e in json_ld: + for e in variadic(json_ld): + if not isinstance(e, dict): + continue if at_top_level and '@context' not in e: continue if at_top_level and set(e.keys()) == {'@context', '@graph'}: - traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + traverse_json_ld(e['@graph'], at_top_level=False) break if expected_type is not None and not is_type(e, expected_type): continue @@ -1629,8 +1627,8 @@ def traverse_json_ld(json_ld, at_top_level=True): continue else: break - traverse_json_ld(json_ld) + traverse_json_ld(json_ld) return filter_dict(info) def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 73422f937c..92390a3875 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2463,6 +2463,21 @@ class GenericIE(InfoExtractor): 'duration': 111.0, } }, + { + 'note': 'JSON LD with unexpected data type', + 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/', + 'info_dict': { + 'id': 'porsche-911-gt3-rs-rij-impressie-2', + 'ext': 'mp4', + 'title': 'Test: Porsche 911 GT3 RS', + 'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. 
En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.', + 'timestamp': 1664920902, + 'upload_date': '20221004', + 'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$', + 'age_limit': 0, + 'direct': True, + } + } ] def report_following_redirect(self, new_url): From 4c9a1a3ba56c2906f9ef8d768de7f8e5a2361144 Mon Sep 17 00:00:00 2001 From: Matthew <coletdjnz@protonmail.com> Date: Sun, 9 Oct 2022 18:55:26 +1300 Subject: [PATCH 253/284] [extractor/wordpress:mb.miniAudioPlayer] Add embed extractor (#5087) Closes https://github.com/yt-dlp/yt-dlp/issues/4994 Authored by: coletdjnz --- test/test_utils.py | 3 ++ yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/wordpress.py | 85 +++++++++++++++++++++++++++++++++ yt_dlp/utils.py | 4 +- 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 90085a9c0d..df23f1f47b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1679,6 +1679,9 @@ def test_get_elements_text_and_html_by_attribute(self): self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), []) + self.assertEqual(list(get_elements_text_and_html_by_attribute( + 'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')]) + GET_ELEMENT_BY_TAG_TEST_STRING = ''' random text lorem ipsum</p> <div> diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 75cb3fcab9..e5be357164 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2165,7 +2165,10 @@ WistiaPlaylistIE, WistiaChannelIE, ) -from .wordpress import WordpressPlaylistEmbedIE +from .wordpress import ( + WordpressPlaylistEmbedIE, + WordpressMiniAudioPlayerEmbedIE, +) from .worldstarhiphop import WorldStarHipHopIE from .wppilot import ( WPPilotIE, diff --git a/yt_dlp/extractor/wordpress.py b/yt_dlp/extractor/wordpress.py index e90ae6c1e1..53820b57a9 100644 --- a/yt_dlp/extractor/wordpress.py +++ b/yt_dlp/extractor/wordpress.py @@ -1,6 +1,10 @@ +import re + from .common import InfoExtractor from ..utils import ( + extract_attributes, get_elements_by_class, + get_elements_text_and_html_by_attribute, int_or_none, parse_duration, traverse_obj, @@ -67,3 +71,84 @@ def _extract_from_webpage(self, url, webpage): 'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))), } for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)] yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist') + + +class WordpressMiniAudioPlayerEmbedIE(InfoExtractor): + # WordPress MB Mini Player Plugin + # https://wordpress.org/plugins/wp-miniaudioplayer/ + # Note: This is for the WordPress plugin version only. 
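# Editor's note (not part of the patch): `_VALID_URL = False` makes this an
# embed-only extractor — it never matches a URL on its own and is reached
# solely through `_extract_from_webpage` when another page embeds the player,
# which is why its tests are listed under `_WEBPAGE_TESTS` below.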
+ _VALID_URL = False + IE_NAME = 'wordpress:mb.miniAudioPlayer' + _WEBPAGE_TESTS = [{ + # Version 1.8.10: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.8.10 + 'url': 'https://news.samsung.com/global/over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound', + 'info_dict': { + 'id': 'over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound', + 'title': 'Over the Horizon: The Evolution of the Samsung Galaxy Brand Sound', + 'age_limit': 0, + 'thumbnail': 'https://img.global.news.samsung.com/global/wp-content/uploads/2015/04/OTH_Main_Title-e1429612467870.jpg', + 'description': 'md5:bc3dd738d1f11d9232e94e6629983bf7', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'over_the_horizon_2013', + 'ext': 'mp3', + 'title': 'Over the Horizon 2013', + 'url': 'http://news.samsung.com/global/wp-content/uploads/ringtones/over_the_horizon_2013.mp3' + } + }], + 'playlist_count': 6, + 'params': {'skip_download': True} + }, { + # Version 1.9.3: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.3 + 'url': 'https://www.booksontape.com/collections/audiobooks-with-teacher-guides/', + 'info_dict': { + 'id': 'audiobooks-with-teacher-guides', + 'title': 'Audiobooks with Teacher Guides | Books on Tape', + 'age_limit': 0, + 'thumbnail': 'https://www.booksontape.com/wp-content/uploads/2016/09/bot-logo-1200x630.jpg', + }, + 'playlist_mincount': 12 + }, { + # Version 1.9.7: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.7 + # But has spaces around href filter + 'url': 'https://www.estudiords.com.br/temas/', + 'info_dict': { + 'id': 'temas', + 'title': 'Temas Variados', + 'age_limit': 0, + 'timestamp': float, + 'upload_date': str, + 'thumbnail': 'https://www.estudiords.com.br/wp-content/uploads/2021/03/LOGO-TEMAS.png', + 'description': 'md5:ab24d6a7ed0312ad2d466e721679f5a0', + }, + 'playlist_mincount': 30 + }] + + def _extract_from_webpage(self, url, webpage): + # Common function for the WordPress plugin version only. 
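# Editor's illustration (assumed page markup, not an exact plugin dump): the
# regexes below expect plugin boilerplate of roughly this shape:
#   function initializeMiniAudioPlayer(){
#       jQuery("a[href*='.mp3'] ,a[href*='.m4a']").not(".skipPlayer").mb_miniPlayer({...});
#   }
# from which the playable file extensions (and any `.not(...)`-excluded CSS
# classes) are recovered before scanning the page's <a href> candidates.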
+ mb_player_params = self._search_regex( + r'function\s*initializeMiniAudioPlayer\(\){[^}]+jQuery([^;]+)\.mb_miniPlayer', + webpage, 'mb player params', default=None) + if not mb_player_params: + return + # v1.55 - 1.9.3 has "a[href*='.mp3'] ,a[href*='.m4a']" + # v1.9.4+ has "a[href*='.mp3']" only + file_exts = re.findall(r'a\[href\s*\*=\s*\'\.([a-zA-Z\d]+)\'', mb_player_params) + if not file_exts: + return + + candidates = get_elements_text_and_html_by_attribute( + 'href', rf'(?:[^\"\']+\.(?:{"|".join(file_exts)}))', webpage, escape_value=False, tag='a') + + for title, html in candidates: + attrs = extract_attributes(html) + # XXX: not tested - have not found any example of it being used + if any(c in (attrs.get('class') or '') for c in re.findall(r'\.not\("\.([^"]+)', mb_player_params)): + continue + href = attrs['href'] + yield { + 'id': self._generic_id(href), + 'title': title or self._generic_title(href), + 'url': href, + } diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index cb14908c7f..5a88a928d6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -408,7 +408,7 @@ def get_elements_html_by_attribute(*args, **kwargs): return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] -def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True): +def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): """ Return the text (content) and the html (whole) of the tag with the specified attribute in the passed HTML document @@ -419,7 +419,7 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value value = re.escape(value) if escape_value else value partial_element_re = rf'''(?x) - <(?P<tag>[a-zA-Z0-9:._-]+) + <(?P<tag>{tag}) (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? 
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) ''' From ade1fa70cbaaaadaa4772e5f0564870cea3167ef Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 9 Oct 2022 16:09:36 +0530 Subject: [PATCH 254/284] [extractor/generic] Separate embed extraction into own function (#5176) --- yt_dlp/extractor/common.py | 7 +++ yt_dlp/extractor/generic.py | 104 ++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 59 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 18a52a8559..10d44d95a7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -66,6 +66,7 @@ sanitize_filename, sanitize_url, sanitized_Request, + smuggle_url, str_or_none, str_to_int, strip_or_none, @@ -3873,6 +3874,12 @@ def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True): def RetryManager(self, **kwargs): return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs) + def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs): + display_id = traverse_obj(info_dict, 'display_id', 'id') + self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}') + return self._downloader.get_info_extractor('Generic')._extract_embeds( + smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs) + @classmethod def extract_from_webpage(cls, ydl, url, webpage): ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 92390a3875..ad4e3c5b87 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1,5 +1,6 @@ import os import re +import types import urllib.parse import xml.etree.ElementTree @@ -2609,6 +2610,7 @@ def _real_extract(self, url): default_search += ':' return self.url_result(default_search + url) + original_url = url url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None is_intentional = smuggled_data.get('to_generic') @@ -2760,7 +2762,20 @@ def _real_extract(self, url): 'age_limit': self._rta_search(webpage), }) - domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) + self._downloader.write_debug('Looking for embeds') + embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) + if len(embeds) == 1: + return {**info_dict, **embeds[0]} + elif embeds: + return self.playlist_result(embeds, **info_dict) + raise UnsupportedError(url) + + def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): + """Returns an iterator of video entries""" + info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation + video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + actual_url = urlh.geturl() if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
https://github.com/ytdl-org/youtube-dl/issues/2448) @@ -2776,31 +2791,19 @@ def _real_extract(self, url): lambda x: unescapeHTML(x.group(0)), webpage) # TODO: Move to respective extractors - self._downloader.write_debug('Looking for Brightcove embeds') bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': info_dict['title'], - 'id': video_id, - 'entries': entries, - } + return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE) + for bc_url in bc_urls] bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage) if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, info_dict['title'], - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') + return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveNewIE) + for bc_url in bc_urls] - self._downloader.write_debug('Looking for embeds') embeds = [] for ie in self._downloader._ies.values(): + if ie.ie_key() in smuggled_data.get('block_ies', []): + continue gen = ie.extract_from_webpage(self._downloader, url, webpage) current_embeds = [] try: @@ -2809,35 +2812,26 @@ def _real_extract(self, url): except self.StopExtraction: self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), embeds and 'discarding other embeds') - embeds = current_embeds - break + return current_embeds except StopIteration: self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) embeds.extend(current_embeds) - del current_embeds - if len(embeds) == 1: - return {**info_dict, **embeds[0]} - elif embeds: - return self.playlist_result(embeds, **info_dict) + if embeds: + return embeds jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): self.report_detected('JW Player playlist') - return { - **info_dict, - '_type': 'url', - 'ie_key': 'JWPlatform', - 'url': jwplayer_data['playlist'], - } + return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')] try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) if traverse_obj(info, 'formats', ('entries', ..., 'formats')): self.report_detected('JW Player data') - return merge_dicts(info, info_dict) + return [info] except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass @@ -2865,7 +2859,7 @@ def _real_extract(self, url): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) + return [self.url_result(src, YoutubeIE.ie_key())] if src_type == 'application/dash+xml' or ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( src, video_id, mpd_id='dash', fatal=False) @@ -2883,7 +2877,7 @@ def _real_extract(self, url): 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) # https://docs.videojs.com/player#addRemoteTextTrack @@ -2898,28 +2892,26 @@ def _real_extract(self, url): 'url': urllib.parse.urljoin(url, src), 'name': sub.get('label'), 'http_headers': { - 'Referer': full_response.geturl(), + 'Referer': actual_url, }, }) if formats or subtitles: self.report_detected('video.js embed') self._sort_formats(formats) - info_dict['formats'] = formats - 
info_dict['subtitles'] = subtitles - return info_dict + return [{'formats': formats, 'subtitles': subtitles}] # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - return merge_dicts({ + return [merge_dicts({ '_type': 'video' if json_ld.get('ext') else 'url_transparent', 'url': smuggle_url(json_ld['url'], { 'force_videoid': video_id, 'to_generic': True, 'http_headers': {'Referer': url}, }), - }, json_ld, info_dict) + }, json_ld)] def check_video(vurl): if YoutubeIE.suitable(vurl): @@ -2990,13 +2982,13 @@ def filter_video(urls): self._sort_formats(formats) - return { + return [{ 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, - } + }] if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) @@ -3050,17 +3042,14 @@ def filter_video(urls): webpage) if not found: # Look also in Refresh HTTP header - refresh_header = full_response.headers.get('Refresh') + refresh_header = urlh and urlh.headers.get('Refresh') if refresh_header: found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + return [self.url_result(new_url)] else: found = None @@ -3071,10 +3060,12 @@ def filter_video(urls): embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: self.report_detected('twitter:player iframe') - return self.url_result(embed_url) + return [self.url_result(embed_url)] if not found: - raise UnsupportedError(url) + return [] + + domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) entries = [] for video_url in orderedSet(found): @@ -3090,7 +3081,7 @@ def filter_video(urls): video_id = os.path.splitext(video_id)[0] headers = { - 'referer': full_response.geturl() + 'referer': actual_url } entry_info_dict = { @@ -3114,7 +3105,7 @@ def filter_video(urls): if ext == 'smil': entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) elif ext == 'mpd': @@ -3144,14 +3135,9 @@ def filter_video(urls): entries.append(entry_info_dict) - if len(entries) == 1: - return merge_dicts(entries[0], info_dict) - else: + if len(entries) > 1: for num, e in enumerate(entries, start=1): # 'url' results don't have a title if e.get('title') is not None: e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } + return entries From 226c0f3a54faef19e2d2729d0072e7df43a7250b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 10 Oct 2022 20:28:55 +0000 Subject: [PATCH 255/284] [extractor/sbs] Improve `_VALID_URL` (#5193) Closes #5045 Authored by: bashonly --- yt_dlp/extractor/sbs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 6bb499930c..45320339da 100644 --- 
a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -12,6 +12,7 @@ class SBSIE(InfoExtractor): ondemand(?: /video/(?:single/)?| /movie/[^/]+/| + /(?:tv|news)-series/(?:[^/]+/){3}| .*?\bplay=|/watch/ )|news/(?:embeds/)?video/ )(?P<id>[0-9]+)''' @@ -63,6 +64,12 @@ class SBSIE(InfoExtractor): 'note': 'Live stream', 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/news-series/dateline/dateline-2022/dateline-s2022-ep26/2072245827515', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776', + 'only_matching': True, }] def _real_extract(self, url): From 2c98d998181c81ee49908be03c031204fd66d03d Mon Sep 17 00:00:00 2001 From: schnusch <schnusch@users.noreply.github.com> Date: Mon, 10 Oct 2022 22:31:01 +0200 Subject: [PATCH 256/284] [extractors/podbayfm] Add extractor (#4971) Authored by: schnusch --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/podbayfm.py | 75 +++++++++++++++++++++++++++++++++ yt_dlp/utils.py | 3 +- 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/podbayfm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e5be357164..d514f9a894 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1345,6 +1345,7 @@ PluralsightIE, PluralsightCourseIE, ) +from .podbayfm import PodbayFMIE, PodbayFMChannelIE from .podchaser import PodchaserIE from .podomatic import PodomaticIE from .pokemon import ( diff --git a/yt_dlp/extractor/podbayfm.py b/yt_dlp/extractor/podbayfm.py new file mode 100644 index 0000000000..2a26fd2b36 --- /dev/null +++ b/yt_dlp/extractor/podbayfm.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call + + +def result_from_props(props, episode_id=None): + return { + 'id': props.get('podcast_id') or episode_id, + 'title': props.get('title'), + 'url': props['mediaURL'], + 'ext': 'mp3', + 'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']), + 'timestamp': props.get('timestamp'), + 'duration': int_or_none(props.get('duration')), + } + + +class PodbayFMIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400', + 'md5': '98b41285dcf7989d105a4ed0404054cf', + 'info_dict': { + 'id': '1647338400', + 'title': 'Part One: Kissinger', + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1647338400, + 'duration': 5001, + 'upload_date': '20220315', + }, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + data = self._search_nextjs_data(webpage, episode_id) + return result_from_props(data['props']['pageProps']['episode'], episode_id) + + +class PodbayFMChannelIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards', + 'info_dict': { + 'id': 'behind-the-bastards', + 'title': 'Behind the Bastards', + }, + }] + _PAGE_SIZE = 10 + + def _fetch_page(self, channel_id, pagenum): + return self._download_json( + f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}', + channel_id)['podcast'] + + @staticmethod + def _results_from_page(channel_id, page): + return [{ 
+ **result_from_props(e), + 'extractor': PodbayFMIE.IE_NAME, + 'extractor_key': PodbayFMIE.ie_key(), + # somehow they use timestamps as the episode identifier + 'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}', + } for e in page['episodes']] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + first_page = self._fetch_page(channel_id, 0) + entries = OnDemandPagedList( + lambda pagenum: self._results_from_page( + channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page), + self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id, first_page.get('title')) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 5a88a928d6..c2327ae1d0 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5499,7 +5499,8 @@ def jwt_encode_hs256(payload_data, key, headers={}): # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256 def jwt_decode_hs256(jwt): header_b64, payload_b64, signature_b64 = jwt.split('.') - payload_data = json.loads(base64.urlsafe_b64decode(payload_b64)) + # add trailing ='s that may have been stripped, superfluous ='s are ignored + payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) return payload_data From d509c1f5a347d0247593f116fa5cad2ff4f9a3de Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Sun, 9 Oct 2022 04:18:28 +0530 Subject: [PATCH 257/284] [utils] `strftime_or_none`: Workaround Python bug on Windows Closes #5185 --- yt_dlp/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index c2327ae1d0..6cfbcdb8db 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2574,7 +2574,9 @@ def strftime_or_none(timestamp, date_format, default=None): datetime_object = None try: if isinstance(timestamp, (int, float)): # unix timestamp - datetime_object = datetime.datetime.utcfromtimestamp(timestamp) + # Using naive datetime here can break timestamp() in Windows + # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414 + datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) elif isinstance(timestamp, str): # assume YYYYMMDD datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') date_format = re.sub( # Support %s on windows From 0468a3b3253957bfbeb98b4a7c71542ff80e9e06 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Tue, 11 Oct 2022 07:59:27 +0530 Subject: [PATCH 258/284] [jsinterp] Improve separating regex Fixes https://github.com/yt-dlp/yt-dlp/issues/4635#issuecomment-1273974909 --- test/test_jsinterp.py | 5 +++++ test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 2 +- yt_dlp/jsinterp.py | 6 ++++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 92ef532f56..3c4391c4ab 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -392,6 +392,11 @@ def test_regex(self): ''') self.assertEqual(jsi.call_function('x').pattern, r',][}",],()}(\[)') + jsi = JSInterpreter(R''' + function x() { let a=[/[)\\]/]; return a[0]; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + def test_char_code_at(self): jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('x', 0), 116) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index c3dcb4d68f..6d753fbf09 100644 ---
a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -130,6 +130,10 @@ 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', ), + ( + 'https://www.youtube.com/s/player/7a062b77/player_ias.vflset/en_US/base.js', + 'NRcE3y3mVtm_cV-W', 'VbsCYUATvqlt5w', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 6f153bb3cf..35e41753a2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2832,7 +2832,7 @@ def _decrypt_nsig(self, s, video_id, player_url): self.report_warning( f'Native nsig extraction failed: Trying with PhantomJS\n' f' n = {s} ; player = {player_url}', video_id) - self.write_debug(e) + self.write_debug(e, only_once=True) args, func_body = func_code ret = jsi.execute( diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 4caad6f743..e25997129d 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -236,7 +236,7 @@ def _regex_flags(cls, expr): @staticmethod def _separate(expr, delim=',', max_split=None): - OP_CHARS = '+-*/%&|^=<>!,;{}:' + OP_CHARS = '+-*/%&|^=<>!,;{}:[' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} @@ -246,7 +246,9 @@ def _separate(expr, delim=',', max_split=None): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 elif not in_quote and char in counters: - counters[char] -= 1 + # Something's wrong if we get negative, but ignore it anyway + if counters[char]: + counters[char] -= 1 elif not escaping: if char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': From 36069409ec7ed88f7571f29ff35a5a4c62b70cfc Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Tue, 11 Oct 2022 05:39:12 +0200 Subject: [PATCH 259/284] [cookies] Improve `LenientSimpleCookie` (#5195) Closes #5186 Authored by: Grub4K --- test/test_cookies.py | 15 +++++++++++++++ yt_dlp/cookies.py | 30 +++++++++++++----------------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/test/test_cookies.py b/test/test_cookies.py index 61619df297..4155bcbf55 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -277,9 +277,24 @@ def test_lenient_parsing(self): "a=b; invalid; Version=1; c=d", {"a": "b", "c": "d"}, ), + ( + "Reset morsel after invalid to not capture attributes", + "a=b; $invalid; $Version=1; c=d", + {"a": "b", "c": "d"}, + ), ( "Continue after non-flag attribute without value", "a=b; path; Version=1; c=d", {"a": "b", "c": "d"}, ), + ( + "Allow cookie attributes with `$` prefix", + 'Customer="WILE_E_COYOTE"; $Version=1; $Secure; $Path=/acme', + {"Customer": ("WILE_E_COYOTE", {"version": "1", "secure": True, "path": "/acme"})}, + ), + ( + "Invalid Morsel keys should not result in an error", + "Key=Value; [Invalid]=Value; Another=Value", + {"Key": "Value", "Another": "Value"}, + ), ) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 3032d07122..8ca7cea2ce 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -999,8 +999,9 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta class LenientSimpleCookie(http.cookies.SimpleCookie): """More lenient version of http.cookies.SimpleCookie""" # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py - _LEGAL_KEY_CHARS = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=" - _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + r"\[\]" + # We use Morsel's legal key chars to avoid errors on setting values + _LEGAL_KEY_CHARS = 
r'\w\d' + re.escape('!#$%&\'*+-.:^_`|~') + _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + re.escape('(),/<=>?@[]{}') _RESERVED = { "expires", @@ -1046,25 +1047,17 @@ def load(self, data): return super().load(data) morsel = None - index = 0 - length = len(data) - - while 0 <= index < length: - match = self._COOKIE_PATTERN.search(data, index) - if not match: - break - - index = match.end(0) - if match.group("bad"): + for match in self._COOKIE_PATTERN.finditer(data): + if match.group('bad'): morsel = None continue - key, value = match.group("key", "val") + key, value = match.group('key', 'val') - if key[0] == "$": - if morsel is not None: - morsel[key[1:]] = True - continue + is_attribute = False + if key.startswith('$'): + key = key[1:] + is_attribute = True lower_key = key.lower() if lower_key in self._RESERVED: @@ -1081,6 +1074,9 @@ def load(self, data): morsel[key] = value + elif is_attribute: + morsel = None + elif value is not None: morsel = self.get(key, http.cookies.Morsel()) real_value, coded_value = self.value_decode(value) From 13b2ae29c2056c5306c3b735e801e9b091a33739 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Tue, 11 Oct 2022 07:54:38 +0200 Subject: [PATCH 260/284] [extractor/twitter] Support multi-video posts (#5183) Closes #5157, Closes #5147 Authored by: Grub4K --- yt_dlp/extractor/twitter.py | 319 +++++++++++++++++++++++++----------- 1 file changed, 225 insertions(+), 94 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index d516aafa28..771a58ab43 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,6 +1,7 @@ import re from .common import InfoExtractor +from .periscope import PeriscopeBaseIE, PeriscopeIE from ..compat import ( compat_HTTPError, compat_parse_qs, @@ -8,25 +9,22 @@ compat_urllib_parse_urlparse, ) from ..utils import ( - dict_get, ExtractorError, - format_field, + dict_get, float_or_none, + format_field, int_or_none, + make_archive_id, + str_or_none, + strip_or_none, traverse_obj, try_get, - strip_or_none, unified_timestamp, update_url_query, url_or_none, xpath_text, ) -from .periscope import ( - PeriscopeBaseIE, - PeriscopeIE, -) - class TwitterBaseIE(InfoExtractor): _API_BASE = 'https://api.twitter.com/1.1/' @@ -85,7 +83,7 @@ def _search_dimensions_in_video_url(a_format, video_url): def _call_api(self, path, video_id, query={}): headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', } token = self._get_cookies(self._API_BASE).get('ct0') if token: @@ -202,7 +200,8 @@ class TwitterIE(TwitterBaseIE): _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'info_dict': { - 'id': '643211948184596480', + 'id': '643211870443208704', + 'display_id': '643211948184596480', 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', @@ -213,6 +212,12 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1442188653, 'upload_date': '20150913', 'age_limit': 18, + 'uploader_url': 'https://twitter.com/freethenipple', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 18, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -232,6 +237,7 @@ class TwitterIE(TwitterBaseIE): 'url': 
'https://twitter.com/starwars/status/665052190608723968', 'info_dict': { 'id': '665052190608723968', + 'display_id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', @@ -239,6 +245,12 @@ class TwitterIE(TwitterBaseIE): 'uploader': 'Star Wars', 'timestamp': 1447395772, 'upload_date': '20151113', + 'uploader_url': 'https://twitter.com/starwars', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['TV', 'StarWars', 'TheForceAwakens'], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', @@ -251,6 +263,12 @@ class TwitterIE(TwitterBaseIE): 'uploader': 'Brent Yarina', 'timestamp': 1456976204, 'upload_date': '20160303', + 'uploader_url': 'https://twitter.com/BTNBrentYarina', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 @@ -260,16 +278,23 @@ class TwitterIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', 'info_dict': { - 'id': '700207533655363584', + 'id': '700207414000242688', + 'display_id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'jaydin donte geer - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vertugo', - 'uploader_id': 'simonvertugo', + 'uploader': 'jaydin donte geer', + 'uploader_id': 'jaydingeer', 'duration': 30.0, 'timestamp': 1455777459, 'upload_date': '20160218', + 'uploader_url': 'https://twitter.com/jaydingeer', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['Damndaniel'], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -282,12 +307,19 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', + 'thumbnail': r're:^https?://.*\.jpg', + 'alt_title': 'Vine by TAKUMA', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'view_count': int, }, 'add_ie': ['Vine'], }, { 'url': 'https://twitter.com/captainamerica/status/719944021058060289', 'info_dict': { - 'id': '719944021058060289', + 'id': '717462543795523584', + 'display_id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. 
https://t.co/GpgYi9xMJI', @@ -296,6 +328,13 @@ class TwitterIE(TwitterBaseIE): 'duration': 3.17, 'timestamp': 1460483005, 'upload_date': '20160412', + 'uploader_url': 'https://twitter.com/CaptainAmerica', + 'thumbnail': r're:^https?://.*\.jpg', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', @@ -307,6 +346,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1PmKqpJdOJQoY', 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, + 'thumbnail': r're:^https?://.*\.jpg', }, 'add_ie': ['Periscope'], }, { @@ -327,7 +367,8 @@ class TwitterIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { - 'id': '910031516746514432', + 'id': '910030238373089285', + 'display_id': '910031516746514432', 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', @@ -337,6 +378,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 47.48, 'timestamp': 1505803395, 'upload_date': '20170919', + 'uploader_url': 'https://twitter.com/Prefet971', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['Maria'], + 'age_limit': 0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -345,7 +392,8 @@ class TwitterIE(TwitterBaseIE): # card via api.twitter.com/1.1/videos/tweet/config 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', 'info_dict': { - 'id': '1001551623938805763', + 'id': '1001551417340022785', + 'display_id': '1001551623938805763', 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', @@ -355,6 +403,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 111.278, 'timestamp': 1527623489, 'upload_date': '20180529', + 'uploader_url': 'https://twitter.com/LisPower1', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -362,7 +416,8 @@ class TwitterIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/foobar/status/1087791357756956680', 'info_dict': { - 'id': '1087791357756956680', + 'id': '1087791272830607360', + 'display_id': '1087791357756956680', 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', @@ -372,6 +427,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 61.567, 'timestamp': 1548184644, 'upload_date': '20190122', + 'uploader_url': 'https://twitter.com/Twitter', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, }, { # not available in Periscope @@ -382,13 +443,17 @@ class TwitterIE(TwitterBaseIE): 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', 'uploader': 'Vivi', 'uploader_id': '1eVjYOLGkGrQL', + 'thumbnail': r're:^https?://.*\.jpg', + 'tags': ['EduTECH2019'], + 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', 'info_dict': { - 'id': '1349794411333394432', + 'id': '1349774757969989634', + 'display_id': '1349794411333394432', 'ext': 'mp4', 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', 'thumbnail': r're:^https?://.*\.jpg', @@ -398,10 +463,57 @@ class TwitterIE(TwitterBaseIE): 'duration': 324.484, 'timestamp': 1610651040, 'upload_date': '20210114', + 'uploader_url': 'https://twitter.com/BrooklynNets', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { 'skip_download': True, }, + }, { + 'url': 'https://twitter.com/oshtru/status/1577855540407197696', + 'info_dict': { + 'id': '1577855447914409984', + 'display_id': '1577855540407197696', + 'ext': 'mp4', + 'title': 'oshtru \U0001faac\U0001f47d - gm \u2728\ufe0f now I can post image and video. nice update.', + 'description': 'gm \u2728\ufe0f now I can post image and video. nice update. https://t.co/cG7XgiINOm', + 'upload_date': '20221006', + 'uploader': 'oshtru \U0001faac\U0001f47d', + 'uploader_id': 'oshtru', + 'uploader_url': 'https://twitter.com/oshtru', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 30.03, + 'timestamp': 1665025050.0, + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', + 'info_dict': { + 'id': '1577719286659006464', + 'title': 'Ultima | #\u0432\u029f\u043c - Test', + 'description': 'Test https://t.co/Y3KEZD7Dad', + 'uploader': 'Ultima | #\u0432\u029f\u043c', + 'uploader_id': 'UltimaShadowX', + 'uploader_url': 'https://twitter.com/UltimaShadowX', + 'upload_date': '20221005', + 'timestamp': 1664992565.0, + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, + }, + 'playlist_count': 4, + 'params': {'skip_download': True}, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -479,6 +591,8 @@ def _real_extract(self, url): } def extract_from_video_info(media): + media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + self.write_debug(f'Extracting from video info: {media_id}') video_info = media.get('video_info') or {} formats = [] @@ -503,90 +617,107 @@ def add_thumbnail(name, size): add_thumbnail(name, size) add_thumbnail('orig', media.get('original_info') or {}) - info.update({ + return { + 'id': media_id, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), - }) + } - media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False) - if media and media.get('type') != 
'photo': - extract_from_video_info(media) - else: - card = status.get('card') - if card: - binding_values = card['binding_values'] + def extract_from_card_info(card): + if not card: + return - def get_binding_value(k): - o = binding_values.get(k) or {} - return try_get(o, lambda x: x[x['type'].lower() + '_value']) + self.write_debug(f'Extracting from card info: {card.get("url")}') + binding_values = card['binding_values'] - card_name = card['name'].split(':')[-1] - if card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - elif card_name == 'summary': - info.update({ - '_type': 'url', - 'url': get_binding_value('card_url'), - }) - elif card_name == 'unified_card': - media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] - extract_from_video_info(next(iter(media_entities.values()))) - # amplify, promo_video_website, promo_video_convo, appplayer, - # video_direct_message, poll2choice_video, poll3choice_video, - # poll4choice_video, ... - else: - is_amplify = card_name == 'amplify' - vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') - content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) - formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) - self._sort_formats(formats) + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) - thumbnails = [] - for suffix in ('_small', '', '_large', '_x_large', '_original'): - image = get_binding_value('player_image' + suffix) or {} - image_url = image.get('url') - if not image_url or '/player-placeholder' in image_url: - continue - thumbnails.append({ - 'id': suffix[1:] if suffix else 'medium', - 'url': image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) - - info.update({ - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'duration': int_or_none(get_binding_value( - 'content_duration_seconds')), - }) - else: - expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) - if not expanded_url: - raise ExtractorError("There's no video in this tweet.") - info.update({ + card_name = card['name'].split(':')[-1] + if card_name == 'player': + return { '_type': 'url', - 'url': expanded_url, - }) - return info + 'url': get_binding_value('player_url'), + } + elif card_name == 'periscope_broadcast': + return { + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + } + elif card_name == 'broadcast': + return { + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + } + elif card_name == 'summary': + return { + '_type': 'url', + 'url': get_binding_value('card_url'), + } + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + media = traverse_obj(media_entities, ..., expected_type=dict, get_all=False) + return extract_from_video_info(media) + # amplify, 
promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... + else: + is_amplify = card_name == 'amplify' + vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') + content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) + formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + return { + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + } + + media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo') + videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict)) + entries = [{**info, **data, 'display_id': twid} for data in videos if data] + + data = extract_from_card_info(status.get('card')) + if data: + entries.append({**info, **data, 'display_id': twid}) + + if not entries: + expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none) + if not expanded_url or expanded_url == url: + raise ExtractorError('No video could be found in this tweet', expected=True) + + return self.url_result(expanded_url, display_id=twid, **info) + + entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)] + + if len(entries) == 1: + return entries[0] + + for index, entry in enumerate(entries, 1): + entry['title'] += f' #{index}' + + return self.playlist_result(entries, **info) class TwitterAmplifyIE(TwitterBaseIE): From 82fb2357d90ace7a321f5c5fa55cd1a5bdb01578 Mon Sep 17 00:00:00 2001 From: sam <mail@samueljenks.me> Date: Wed, 12 Oct 2022 17:12:31 +1300 Subject: [PATCH 261/284] [extractor/twitter] Add onion site to `_VALID_URL` (#5208) See #3053 Authored by: DoubleCouponDay --- yt_dlp/extractor/twitter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 771a58ab43..f007454dc4 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -28,7 +28,7 @@ class TwitterBaseIE(InfoExtractor): _API_BASE = 'https://api.twitter.com/1.1/' - _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _GUEST_TOKEN = None def _extract_variant_formats(self, variant, video_id): @@ -514,6 +514,10 @@ class TwitterIE(TwitterBaseIE): }, 'playlist_count': 4, 'params': {'skip_download': True}, + }, { + # onion route + 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', + 'only_matching': True, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', From a79bf78397088fd6c3dde1f8370a030ab43b8b99 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan.ytdlp@gmail.com> Date: Wed, 12 Oct 2022 11:09:28 +0530 Subject: [PATCH 
262/284] [extractor/tnaflix] Fix 09c127ff838505de1bddde56ad4d22f46ebf6ed7 Closes #5188 --- yt_dlp/extractor/tnaflix.py | 206 +++++++++++++++++++----------------- 1 file changed, 108 insertions(+), 98 deletions(-) diff --git a/yt_dlp/extractor/tnaflix.py b/yt_dlp/extractor/tnaflix.py index 8cbfeb7fba..eceaadb308 100644 --- a/yt_dlp/extractor/tnaflix.py +++ b/yt_dlp/extractor/tnaflix.py @@ -1,3 +1,5 @@ +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -7,6 +9,7 @@ parse_duration, str_to_int, unescapeHTML, + url_basename, xpath_text, ) @@ -18,9 +21,6 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"', r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] - _HOST = 'tna' - _VIDEO_XML_URL = 'https://www.tnaflix.com/cdn/cdn.php?file={}.fid&key={}&VID={}&nomp4=1&catID=0&rollover=1&startThumb=12&embed=0&utm_source=0&multiview=0&premium=1&country=0user=0&vip=1&cd=0&ref=0&alpha' - _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -71,11 +71,7 @@ def get_child(elem, names): def _real_extract(self, url): mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - def extract_field(pattern, name): - return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - + video_id, host = mobj.group('id', 'host') for display_id_key in ('display_id', 'display_id_2'): if display_id_key in mobj.groupdict(): display_id = mobj.group(display_id_key) @@ -86,122 +82,138 @@ def extract_field(pattern, name): webpage = self._download_webpage(url, display_id) + # check for MovieFap-style config cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, group='url'), 'http:') + query = {} - if not cfg_url: - vkey = extract_field(r'<input\b[^>]+\bid="vkey"\b[^>]+\bvalue="([^"]+)"', 'vkey') - nkey = extract_field(r'<input\b[^>]+\bid="nkey"\b[^>]+\bvalue="([^"]+)"', 'nkey') - vid = extract_field(r'<input\b[^>]+\bid="VID"\b[^>]+\bvalue="([^"]+)"', 'vid') - if vkey and nkey and vid: - cfg_url = self._proto_relative_url(self._VIDEO_XML_URL.format(vkey, nkey, vid), 'http:') - + # check for TNAFlix-style config if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) + if inputs.get('vkey') and inputs.get('nkey'): + cfg_url = f'https://www.{host}.com/cdn/cdn.php' + query.update({ + 'file': inputs['vkey'], + 'key': inputs['nkey'], + 'VID': video_id, + 'premium': '1', + 'vip': '1', + 'alpha': '', + }) - cfg_xml = self._download_xml( - cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands, headers={'Referer': url}) + formats, json_ld = [], {} - formats = [] + # TNAFlix and MovieFap extraction + if cfg_url: + cfg_xml = self._download_xml( + cfg_url, display_id, 'Downloading metadata', + transform_source=fix_xml_ampersands, headers={'Referer': url}, query=query) - def extract_video_url(vl): - # Any URL modification now results in HTTP Error 403: Forbidden - return unescapeHTML(vl.text) + def extract_video_url(vl): + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) - video_link = cfg_xml.find('./videoLink') - if 
video_link is not None: - formats.append({ - 'url': extract_video_url(video_link), - 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + + for item in cfg_xml.findall('./quality/item'): + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), + 'format_id': format_id, + 'height': height, + }) + + thumbnails = self._extract_thumbnails(cfg_xml) or [] + thumbnails.append({ + 'url': self._proto_relative_url(xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') }) - for item in cfg_xml.findall('./quality/item'): - video_link = item.find('./videoLink') - if video_link is None: - continue - res = item.find('res') - format_id = None if res is None else res.text - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), - 'format_id': format_id, - 'height': height, - }) + # check for EMPFlix-style JSON and extract + else: + player = self._download_json( + f'http://www.{host}.com/ajax/video-player/{video_id}', video_id, + headers={'Referer': url}).get('html', '') + for mobj in re.finditer(r'<source src="(?P<src>[^"]+)"', player): + video_url = mobj.group('src') + height = self._search_regex(r'-(\d+)p\.', url_basename(video_url), 'height', default=None) + formats.append({ + 'url': self._proto_relative_url(video_url, 'http:'), + 'ext': url_basename(video_url).split('.')[-1], + 'height': int_or_none(height), + 'format_id': f'{height}p' if height else url_basename(video_url).split('.')[0], + }) + thumbnail = self._proto_relative_url(self._search_regex( + r'data-poster="([^"]+)"', player, 'thumbnail', default=None), 'http:') + thumbnails = [{'url': thumbnail}] if thumbnail else None + json_ld = self._search_json_ld(webpage, display_id, default={}) + + def extract_field(pattern, name): + return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None self._sort_formats(formats) - - thumbnail = self._proto_relative_url( - xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') - thumbnails = self._extract_thumbnails(cfg_xml) - - title = None - if self._TITLE_REGEX: - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title', default=None) - if not title: - title = self._og_search_title(webpage) - - age_limit = self._rta_search(webpage) or 18 - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) - - description = extract_field(self._DESCRIPTION_REGEX, 'description') - uploader = extract_field(self._UPLOADER_REGEX, 'uploader') - view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) - comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) - average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) - - categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] - return { 'id': video_id, 
'display_id': display_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
+            'title': (extract_field(self._TITLE_REGEX, 'title')
+                      or self._og_search_title(webpage, default=None)
+                      or json_ld.get('title')),
+            'description': extract_field(self._DESCRIPTION_REGEX, 'description') or json_ld.get('description'),
             'thumbnails': thumbnails,
-            'duration': duration,
-            'age_limit': age_limit,
-            'uploader': uploader,
-            'view_count': view_count,
-            'comment_count': comment_count,
-            'average_rating': average_rating,
-            'categories': categories,
+            'duration': parse_duration(
+                self._html_search_meta('duration', webpage, 'duration', default=None)) or json_ld.get('duration'),
+            'age_limit': self._rta_search(webpage) or 18,
+            'uploader': extract_field(self._UPLOADER_REGEX, 'uploader') or json_ld.get('uploader'),
+            'view_count': str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')),
+            'comment_count': str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')),
+            'average_rating': float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')),
+            'categories': list(map(str.strip, (extract_field(self._CATEGORIES_REGEX, 'categories') or '').split(','))),
             'formats': formats,
         }
 
 
 class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
-    _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+    _VALID_URL = r'https?://player\.(?P<host>tnaflix|empflix)\.com/video/(?P<id>\d+)'
 
     _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1']
 
-    _TITLE_REGEX = r'<title>([^<]+)</title>'
-
     _TESTS = [{
         'url': 'https://player.tnaflix.com/video/6538',
         'info_dict': {
             'id': '6538',
             'display_id': '6538',
             'ext': 'mp4',
-            'title': 'Educational xxx video',
+            'title': 'Educational xxx video (G Spot)',
+            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
             'thumbnail': r're:https?://.*\.jpg$',
             'age_limit': 18,
+            'duration': 164,
+            'uploader': 'bobwhite39',
+            'categories': list,
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        'url': 'https://player.empflix.com/video/33051',
+        'url': 'http://player.empflix.com/video/33051',
         'only_matching': True,
     }]
 
+    def _real_extract(self, url):
+        mobj = self._match_valid_url(url)
+        video_id, host = mobj.group('id', 'host')
+        return self.url_result(f'http://www.{host}.com/category/{video_id}/video{video_id}')
+
 
 class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE):
     _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<'
@@ -210,7 +222,7 @@ class TNAFlixIE(TNAEMPFlixBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>tnaflix)\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
     _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>'
@@ -226,17 +238,17 @@ class TNAFlixIE(TNAEMPFlixBaseIE):
             'thumbnail': r're:https?://.*\.jpg$',
             'duration': 91,
             'age_limit': 18,
-            'categories': ['Porn Stars'],
+            'categories': list,
         }
     }, {
         # non-anonymous uploader, categories
         'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
-        'md5': '0f5d4d490dbfd117b8607054248a07c0',
+        'md5': 'add5a9fa7f4da53d3e9d0845ac58f20c',
         'info_dict': {
             'id': '6538',
             'display_id': 'Educational-xxx-video',
             'ext': 'mp4',
-            'title': 'Educational xxx video',
+            'title': 'Educational xxx video (G Spot)',
             'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
             'thumbnail': r're:https?://.*\.jpg$',
             'duration': 164,
@@ -251,14 +263,11 @@ class TNAFlixIE(TNAEMPFlixBaseIE):
 
 
 class EMPFlixIE(TNAEMPFlixBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)'
-
-    _HOST = 'emp'
-    _VKEY_SUFFIX = '-1'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>empflix)\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)'
 
     _TESTS = [{
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'bc30d48b91a7179448a0bda465114676',
+        'url': 'http://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051',
+        'md5': 'd761c7b26601bd14476cd9512f2654fc',
         'info_dict': {
             'id': '33051',
             'display_id': 'Amateur-Finger-Fuck',
@@ -268,20 +277,20 @@ class EMPFlixIE(TNAEMPFlixBaseIE):
             'thumbnail': r're:https?://.*\.jpg$',
             'duration': 83,
             'age_limit': 18,
-            'uploader': 'cwbike',
-            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+            'uploader': None,
+            'categories': list,
         }
     }, {
         'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
         'only_matching': True,
     }, {
-        'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051',
+        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
         'only_matching': True,
     }]
 
 
 class MovieFapIE(TNAFlixNetworkBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>moviefap)\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
 
     _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
     _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
@@ -323,5 +332,6 @@ class MovieFapIE(TNAFlixNetworkBaseIE):
         'comment_count': int,
         'average_rating': float,
         'categories': ['Amateur', 'Teen'],
-    }
+    },
+    'skip': 'This video does not exist',
 }]

From c6989aa3ae5d79137cf6e4228220ad620519bcbd Mon Sep 17 00:00:00 2001
From: sam <mail@samueljenks.me>
Date: Wed, 12 Oct 2022 22:55:42 +1300
Subject: [PATCH 263/284] [extractor/aeon] Add extractor (#5205)

Closes #1653
Authored by: DoubleCouponDay
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/aeonco.py      | 40 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 yt_dlp/extractor/aeonco.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index d514f9a894..1dcbf71eff 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -65,6 +65,7 @@
     HistoryPlayerIE,
     BiographyIE,
 )
+from .aeonco import AeonCoIE
 from .afreecatv import (
     AfreecaTVIE,
     AfreecaTVLiveIE,
diff --git a/yt_dlp/extractor/aeonco.py b/yt_dlp/extractor/aeonco.py
new file mode 100644
index 0000000000..4655862e3f
--- /dev/null
+++ b/yt_dlp/extractor/aeonco.py
@@ -0,0 +1,40 @@
+from .common import InfoExtractor
+from .vimeo import VimeoIE
+
+
+class AeonCoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?aeon\.co/videos/(?P<id>[^/?]+)'
+    _TESTS = [{
+        'url': 'https://aeon.co/videos/raw-solar-storm-footage-is-the-punk-rock-antidote-to-sleek-james-webb-imagery',
+        'md5': 'e5884d80552c9b6ea8d268a258753362',
+        'info_dict': {
+            'id': '1284717',
+            'ext': 'mp4',
+            'title': 'Brilliant Noise',
+            'thumbnail': 'https://i.vimeocdn.com/video/21006315-1a1e49da8b07fd908384a982b4ba9ff0268c509a474576ebdf7b1392f4acae3b-d_960',
+            'uploader': 'Semiconductor',
+            'uploader_id': 'semiconductor',
+            'uploader_url': 'https://vimeo.com/semiconductor',
+            'duration': 348
+        }
+    }, {
+        'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it',
+        'md5': '4e5f3dad9dbda0dbfa2da41a851e631e',
+        'info_dict': {
+            'id': '728595228',
+            'ext': 'mp4',
+            'title': 'Wrought',
+            'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280',
+            'uploader': 'Biofilm Productions',
+            'uploader_id': 'user140352216',
+            'uploader_url': 'https://vimeo.com/user140352216',
+            'duration': 1344
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id')
+        vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co')
+        return self.url_result(vimeo_url, VimeoIE)

From a71b812f53a5f678e4c9467858e721dcd4953a16 Mon Sep 17 00:00:00 2001
From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com>
Date: Wed, 12 Oct 2022 22:22:17 +0200
Subject: [PATCH 264/284] [utils] `js_to_json`: Improve escape handling (#5217)

Authored by: Grub4K
---
 test/test_utils.py |  6 +++++
 yt_dlp/utils.py    | 59 ++++++++++++++++++++++++++--------------------
 2 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index df23f1f47b..49ab3796b9 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1100,6 +1100,12 @@ def test_js_to_json_edgecases(self):
         on = js_to_json('[1,//{},\n2]')
         self.assertEqual(json.loads(on), [1, 2])
 
+        on = js_to_json(R'"\^\$\#"')
+        self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
+
+        on = js_to_json('\'"\\""\'')
+        self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 6cfbcdb8db..adb7c0e8c5 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3275,6 +3275,8 @@ def strip_jsonp(code):
 
 def js_to_json(code, vars={}, *, strict=False):
     # vars is a dict of var, val pairs to substitute
+    STRING_QUOTES = '\'"'
+    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
     COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
     SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
     INTEGER_TABLE = (
@@ -3282,6 +3284,15 @@ def js_to_json(code, vars={}, *, strict=False):
         (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
     )
 
+    def process_escape(match):
+        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
+        escape = match.group(1) or match.group(2)
+
+        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
+                else R'\u00' if escape == 'x'
+                else '' if escape == '\n'
+                else escape)
+
     def fix_kv(m):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
@@ -3289,28 +3300,25 @@ def fix_kv(m):
         elif v in ('undefined', 'void 0'):
             return 'null'
         elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
-            return ""
+            return ''
 
-        if v[0] in ("'", '"'):
-            v = re.sub(r'(?s)\\.|"', lambda m: {
-                '"': '\\"',
-                "\\'": "'",
-                '\\\n': '',
-                '\\x': '\\u00',
-            }.get(m.group(0), m.group(0)), v[1:-1])
-        else:
-            for regex, base in INTEGER_TABLE:
-                im = re.match(regex, v)
-                if im:
-                    i = int(im.group(1), base)
-                    return '"%d":' % i if v.endswith(':') else '%d' % i
+        if v[0] in STRING_QUOTES:
+            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
+            return f'"{escaped}"'
 
-            if v in vars:
-                return json.dumps(vars[v])
-            if strict:
-                raise ValueError(f'Unknown value: {v}')
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(1), base)
+                return f'"{i}":' if v.endswith(':') else str(i)
 
-        return '"%s"' % v
+        if v in vars:
+            return json.dumps(vars[v])
+
+        if not strict:
+            return f'"{v}"'
+
+        raise ValueError(f'Unknown value: {v}')
 
     def create_map(mobj):
         return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
@@ -3320,15 +3328,14 @@ def create_map(mobj):
     code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
     code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
 
-    return re.sub(r'''(?sx)
-        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
+    return re.sub(rf'''(?sx)
+        {STRING_RE}|
+        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
         void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
+        [0-9]+(?={SKIP_RE}:)|
         !+
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        ''', fix_kv, code)
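The new escape handling is small enough to exercise on its own. The following is a minimal standalone sketch of the idea behind this patch (the `_process_escape` and `js_string_to_json` names are illustrative only; in yt-dlp the logic lives inside `js_to_json` as shown above):

    import json
    import re

    def _process_escape(match):
        # Escapes that JSON already understands pass through unchanged; \xAB
        # becomes \u00AB, escaped newlines (line continuations) vanish, and
        # any other backslash is dropped as superfluous. A bare '"' (group 1)
        # is in the passthrough set, so it comes back escaped as \".
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def js_string_to_json(v):
        # v is a complete JS string literal, including its surrounding quotes
        escaped = re.sub(r'(?s)(")|\\(.)', _process_escape, v[1:-1])
        return f'"{escaped}"'

    assert json.loads(js_string_to_json(R'"\^\$\#"')) == R'^$#'
    assert json.loads(js_string_to_json('\'"\\""\'')) == '"""'

These two asserts mirror the test cases added in test_utils.py above.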
Date: Thu, 13 Oct 2022 04:21:50 +0530 Subject: [PATCH 265/284] Do more processing in `--flat-playlist` --- yt_dlp/YoutubeDL.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e1c24b8925..39df79a3fa 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1621,6 +1621,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): self.add_default_extra_info(info_copy, ie, ie_result['url']) self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) + self._fill_common_fields(info_copy, False) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): @@ -2379,10 +2380,9 @@ def check_thumbnails(thumbnails): else: info_dict['thumbnails'] = thumbnails - def _fill_common_fields(self, info_dict, is_video=True): + def _fill_common_fields(self, info_dict, final=True): # TODO: move sanitization here - if is_video: - # playlists are allowed to lack "title" + if final: title = info_dict.get('title', NO_DEFAULT) if title is NO_DEFAULT: raise ExtractorError('Missing "title" field in extractor result', @@ -2432,7 +2432,7 @@ def _fill_common_fields(self, info_dict, is_video=True): # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. for field in ('chapter', 'season', 'episode'): - if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) def _raise_pending_errors(self, info): From 5225df50cf96d2f462dc3df3c22f8d1e2028872d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 13 Oct 2022 04:23:39 +0530 Subject: [PATCH 266/284] [extractor/youtube:tab] Let `approximate_date` return timestamp --- README.md | 2 +- yt_dlp/extractor/common.py | 4 ++-- yt_dlp/extractor/youtube.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9b59e096a9..7374e0e947 100644 --- a/README.md +++ b/README.md @@ -1724,7 +1724,7 @@ #### youtube #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) -* `approximate_date`: Extract approximate `upload_date` in flat-playlist. This may cause date-based filters to be slightly off +* `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off #### funimation * `language`: Audio languages to extract, e.g. 
`funimation:language=english,japanese` diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 10d44d95a7..ab8def57da 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3843,8 +3843,8 @@ def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense= @param default The default value to return when the key is not present (default: []) @param casesense When false, the values are converted to lower case ''' - val = traverse_obj( - self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) + ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key() + val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 35e41753a2..73c37ac90e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -948,9 +948,9 @@ def _extract_video(self, renderer): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - 'upload_date': (strftime_or_none(self._parse_time_text(time_text), '%Y%m%d') - if self._configuration_arg('approximate_date', ie_key='youtubetab') - else None), + 'timestamp': (self._parse_time_text(time_text) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None), 'release_timestamp': scheduled_timestamp, 'availability': 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) @@ -6105,9 +6105,9 @@ def _extract_notification_renderer(self, notification): title = self._search_regex( rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, 'video title', default=None) - upload_date = (strftime_or_none(self._parse_time_text(self._get_text(notification, 'sentTimeText')), '%Y%m%d') - if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE.ie_key()) - else None) + timestamp = (self._parse_time_text(self._get_text(notification, 'sentTimeText')) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None) return { '_type': 'url', 'url': url, @@ -6117,7 +6117,7 @@ def _extract_notification_renderer(self, notification): 'channel_id': channel_id, 'channel': channel, 'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'), - 'upload_date': upload_date, + 'timestamp': timestamp, } def _notification_menu_entries(self, ytcfg): From 34f00179db37b963d6c8ce8703877a06aa7f1195 Mon Sep 17 00:00:00 2001 From: lauren Date: Fri, 14 Oct 2022 03:41:08 +0200 Subject: [PATCH 267/284] [extractor/cda]: Support login through API (#5100) Authored by: selfisekai --- yt_dlp/extractor/cda.py | 82 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 6d01c60d5e..2a12b054be 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -1,4 +1,8 @@ +import base64 import codecs +import datetime +import hashlib +import hmac import json import re @@ -12,6 +16,8 @@ multipart_encode, parse_duration, random_birthday, + traverse_obj, + try_call, try_get, urljoin, ) @@ -19,7 +25,18 @@ class CDAIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _NETRC_MACHINE = 'cdapl' + _BASE_URL = 'http://www.cda.pl/' + _BASE_API_URL = 'https://api.cda.pl' + _API_HEADERS = { + 'Accept': 'application/vnd.cda.public+json', + 'User-Agent': 'pl.cda 1.0 
(version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)', + } + # hardcoded in the app + _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q' + _BEARER_CACHE = 'cda-bearer' + _TESTS = [{ 'url': 'http://www.cda.pl/video/5749950c', 'md5': '6f844bf51b15f31fae165365707ae970', @@ -83,8 +100,73 @@ def _download_age_confirm_page(self, url, video_id, *args, **kwargs): 'Content-Type': content_type, }, **kwargs) + def _perform_login(self, username, password): + cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} + if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: + self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' + return + + password_hash = base64.urlsafe_b64encode(hmac.new( + b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P', + ''.join(f'{bytes((bt & 255, )).hex():0>2}' + for bt in hashlib.md5(password.encode()).digest()).encode(), + hashlib.sha256).digest()).decode().replace('=', '') + + token_res = self._download_json( + f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'', + headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH}, + query={ + 'grant_type': 'password', + 'login': username, + 'password': password_hash, + }) + self.cache.store(self._BEARER_CACHE, username, { + 'token': token_res['access_token'], + 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(), + }) + self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}' + def _real_extract(self, url): video_id = self._match_id(url) + + if 'Authorization' in self._API_HEADERS: + return self._api_extract(video_id) + else: + return self._web_extract(video_id, url) + + def _api_extract(self, video_id): + meta = self._download_json( + f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video'] + + if meta.get('premium') and not meta.get('premium_free'): + self.report_drm(video_id) + + uploader = traverse_obj(meta, 'author', 'login') + + formats = [{ + 'url': quality['file'], + 'format': quality.get('title'), + 'resolution': quality.get('name'), + 'height': try_call(lambda: int(quality['name'][:-1])), + 'filesize': quality.get('length'), + } for quality in meta['qualities'] if quality.get('file')] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': meta.get('title'), + 'description': meta.get('description'), + 'uploader': None if uploader == 'anonim' else uploader, + 'average_rating': float_or_none(meta.get('rating')), + 'thumbnail': meta.get('thumb'), + 'formats': formats, + 'duration': meta.get('duration'), + 'age_limit': 18 if meta.get('for_adults') else 0, + 'view_count': meta.get('views'), + } + + def _web_extract(self, video_id, url): self._set_cookie('cda.pl', 'cda.player', 'html5') webpage = self._download_webpage( self._BASE_URL + '/video/' + video_id, video_id) From d51b2816e33860f3e2a86bda431e31e48cb2e020 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 14 Oct 2022 06:46:24 +0530 Subject: [PATCH 268/284] [extractor/iq] Increase phantomjs timeout Closes #5161 --- yt_dlp/extractor/iqiyi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index 6a43846c17..bb77647f8c 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -588,8 +588,9 @@ def _real_extract(self, url): ut_list = ['0'] # bid 0 as an 
initial format checker - dash_paths = self._parse_json(PhantomJSwrapper(self).get( - url, html='', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % { + dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get( + url, note2='Executing signature code (this may take a couple minutes)', + html='', video_id=video_id, jscode=self._DASH_JS % { 'tvid': video_info['tvId'], 'vid': video_info['vid'], 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), From 6678a4f0b3074f41f02e968d1d48d7c64e48ef07 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 14 Oct 2022 07:41:53 +0530 Subject: [PATCH 269/284] [extractor/youtube] Fix live_status Bug in 4d37720a0c5f1c9c4768ea20b0f943277f55bc12 --- yt_dlp/extractor/youtube.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 73c37ac90e..857c9670c5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3684,17 +3684,13 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, is_live = get_first(live_broadcast_details, 'isLiveNow') live_content = get_first(video_details, 'isLiveContent') is_upcoming = get_first(video_details, 'isUpcoming') - if is_live is None and is_upcoming or live_content is False: - is_live = False - if is_upcoming is None and (live_content or is_live): - is_upcoming = False post_live = get_first(video_details, 'isPostLiveDvr') live_status = ('post_live' if post_live else 'is_live' if is_live else 'is_upcoming' if is_upcoming - else None if None in (is_live, is_upcoming, live_content) - else 'was_live' if live_content else 'not_live') - + else 'was_live' if live_content + else 'not_live' if False in (is_live, live_content) + else None) streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) From 6dca2aa66de8a142543d5c8b6ccadd251339648e Mon Sep 17 00:00:00 2001 From: Matthew Date: Fri, 14 Oct 2022 17:32:52 +1300 Subject: [PATCH 270/284] [extractor/generic:quoted-html] Add extractor (#5213) Extracts embeds from escaped HTML within `data-html` attribute. 
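In outline, the approach is: find every `data-html` attribute, unescape its value, and run the normal embed detection over the recovered markup. A rough self-contained sketch of that idea (using the stdlib `html.unescape` where the extractor uses yt-dlp's `unescapeHTML`; `extract_data_html_markup` is an illustrative name, not part of the patch):

    import html
    import re
    import urllib.parse

    def extract_data_html_markup(webpage):
        # Collect the unescaped payload of every data-html attribute; embed
        # extraction can then be re-run over the combined markup
        combined = ''
        for _, quoted in re.findall(r'(?s)\bdata-html=(["\'])((?:(?!\1).)+)\1', webpage):
            # html.unescape handles &quot; etc., unquote handles percent-encoding
            unquoted = html.unescape(urllib.parse.unquote(quoted))
            if unquoted != quoted:
                combined += unquoted
        return combined

    page = '<div data-html="&lt;iframe src=&quot;https://example.com/embed&quot;&gt;&lt;/iframe&gt;"></div>'
    assert '<iframe src="https://example.com/embed">' in extract_data_html_markup(page)
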
Related: https://github.com/ytdl-org/youtube-dl/issues/21294, https://github.com/yt-dlp/yt-dlp/pull/5121 Authored by: coletdjnz Co-authored-by: pukkandan --- yt_dlp/extractor/_extractors.py | 6 ++- yt_dlp/extractor/generic.py | 22 -------- yt_dlp/extractor/genericembeds.py | 86 ++++++++++++++++++++++++++++++- yt_dlp/extractor/tv24ua.py | 62 ---------------------- 4 files changed, 89 insertions(+), 87 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1dcbf71eff..8652ec54e5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -698,7 +698,10 @@ HSEShowIE, HSEProductIE, ) -from .genericembeds import HTML5MediaEmbedIE +from .genericembeds import ( + HTML5MediaEmbedIE, + QuotedHTMLIE, +) from .huajiao import HuajiaoIE from .huya import HuyaLiveIE from .huffpost import HuffPostIE @@ -1884,7 +1887,6 @@ ) from .tv24ua import ( TV24UAVideoIE, - TV24UAGenericPassthroughIE ) from .tv2dk import ( TV2DKIE, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ad4e3c5b87..b7a5ffb5b1 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1980,22 +1980,6 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, - { - # Squarespace video embed, 2019-08-28 - 'url': 'http://ootboxford.com', - 'info_dict': { - 'id': 'Tc7b_JGdZfw', - 'title': 'Out of the Blue, at Childish Things 10', - 'ext': 'mp4', - 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', - 'uploader_id': 'helendouglashouse', - 'uploader': 'Helen & Douglas House', - 'upload_date': '20140328', - }, - 'params': { - 'skip_download': True, - }, - }, # { # # Zype embed # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2784,12 +2768,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): # There probably should be a second run of generic extractor on unescaped webpage. 
# webpage = urllib.parse.unquote(webpage) - # Unescape squarespace embeds to be detected by generic extractor, - # see https://github.com/ytdl-org/youtube-dl/issues/21294 - webpage = re.sub( - r']+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', - lambda x: unescapeHTML(x.group(0)), webpage) - # TODO: Move to respective extractors bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: diff --git a/yt_dlp/extractor/genericembeds.py b/yt_dlp/extractor/genericembeds.py index 64bd20e3af..1bffe275a8 100644 --- a/yt_dlp/extractor/genericembeds.py +++ b/yt_dlp/extractor/genericembeds.py @@ -1,5 +1,8 @@ +import re +import urllib.parse + from .common import InfoExtractor -from ..utils import make_archive_id +from ..utils import make_archive_id, unescapeHTML class HTML5MediaEmbedIE(InfoExtractor): @@ -29,3 +32,84 @@ def _extract_from_webpage(self, url, webpage): }) self._sort_formats(entry['formats']) yield entry + + +class QuotedHTMLIE(InfoExtractor): + """For common cases of quoted/escaped html parts in the webpage""" + _VALID_URL = False + IE_NAME = 'generic:quoted-html' + IE_DESC = False # Do not list + _WEBPAGE_TESTS = [{ + # 2 YouTube embeds in data-html + 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'info_dict': { + 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'timestamp': float, + 'upload_date': str, + 'description': 'md5:6816e1e5a65304bd7898e4c7eb1b26f7', + 'age_limit': 0, + }, + 'playlist_count': 2 + }, { + # Generic iframe embed of TV24UAPlayerIE within data-html + 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'info_dict': { + 'id': '1887584', + 'ext': 'mp4', + 'title': 'Харків\'яни згадують місто до війни: щемливе відео', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'skip_download': True} + }, { + # YouTube embeds on Squarespace (data-html): https://github.com/ytdl-org/youtube-dl/issues/21294 + 'url': 'https://www.harvardballetcompany.org/past-productions', + 'info_dict': { + 'id': 'past-productions', + 'title': 'Productions — Harvard Ballet Company', + 'age_limit': 0, + 'description': 'Past Productions', + }, + 'playlist_mincount': 26 + }, { + # Squarespace video embed, 2019-08-28, data-html + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + 'availability': 'public', + 'view_count': int, + 'channel': 'Helen & Douglas House', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/user/helendouglashouse', + 'duration': 253, + 'channel_url': 'https://www.youtube.com/channel/UCTChGezrZVmlYlpMlkmulPA', + 'playable_in_embed': True, + 'age_limit': 0, + 'channel_follower_count': int, + 'channel_id': 'UCTChGezrZVmlYlpMlkmulPA', + 'tags': 'count:6', + 'categories': ['Nonprofits & Activism'], + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/Tc7b_JGdZfw/hqdefault.jpg', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_from_webpage(self, url, webpage): + combined = '' + for _, html in re.findall(r'(?s)\bdata-html=(["\'])((?:(?!\1).)+)\1', webpage): + # unescapeHTML can handle " etc., unquote can handle 
percent encoding + unquoted_html = unescapeHTML(urllib.parse.unquote(html)) + if unquoted_html != html: + combined += unquoted_html + if combined: + yield from self._extract_generic_embeds(url, combined) diff --git a/yt_dlp/extractor/tv24ua.py b/yt_dlp/extractor/tv24ua.py index 723049e781..553a70b6b2 100644 --- a/yt_dlp/extractor/tv24ua.py +++ b/yt_dlp/extractor/tv24ua.py @@ -1,15 +1,10 @@ -import base64 import re -import urllib.parse from .common import InfoExtractor from ..utils import ( determine_ext, - extract_attributes, - get_elements_html_by_class, js_to_json, mimetype2ext, - smuggle_url, traverse_obj, ) @@ -87,60 +82,3 @@ def _real_extract(self, url): 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), 'description': self._og_search_description(webpage, default=None), } - - -class TV24UAGenericPassthroughIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-zA-Z0-9]+?\.)?24tv\.ua/(?P[^/]+?_n\d+)' - - _TESTS = [{ - # Generic iframe, not within media_embed - 'url': 'https://24tv.ua/vipalyuyut-nashi-mista-sela-dsns-pokazali-motoroshni-naslidki_n1883966', - 'info_dict': { - 'id': '1883966', - 'ext': 'mp4', - 'title': 'Випалюють наші міста та села, – моторошні наслідки обстрілів на Чернігівщині', - 'thumbnail': r're:^https?://.*\.jpe?g', - } - }, { - # Generic iframe embed of TV24UAPlayerIE, within media_embed - 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', - 'info_dict': { - 'id': 'harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', - 'title': 'Харків\'яни згадують місто до війни: щемливе відео' - }, - 'playlist': [{ - 'info_dict': { - 'id': '1887584', - 'ext': 'mp4', - 'title': 'Харків\'яни згадують місто до війни: щемливе відео', - 'thumbnail': r're:^https?://.*\.jpe?g', - }, - }] - }, { - # 2 media_embeds with YouTube iframes - 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', - 'info_dict': { - 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', - 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', - }, - 'playlist_count': 2 - }, { - 'url': 'https://men.24tv.ua/fitnes-bloger-sprobuvav-vikonati-trenuvannya-naysilnishoyi-lyudini_n2164538', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - data_urls = [] - # The site contains escaped iframe embeds within an attribute. - # Once escaped, generic can handle them, so we use a data url to pass the escaped html back. 
- for html in get_elements_html_by_class('media_embed', webpage): - data = urllib.parse.unquote(extract_attributes(html).get('data-html')) - data_urls.append(f'data:text/html;base64,{base64.b64encode(data.encode("utf-8")).decode("utf-8")}') - - if not data_urls: - return self.url_result(url, 'Generic') - return self.playlist_from_matches( - [smuggle_url(url, {'to_generic': True}) for url in data_urls], display_id, ie='Generic', - playlist_title=self._og_search_title(webpage) or self._html_extract_title(webpage)) From 9b9dad119a5307fb847aa5626d9391b59f1865d5 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 14 Oct 2022 11:48:45 +0530 Subject: [PATCH 271/284] [outtmpl] Ensure ASCII in json and add option for Unicode Closes #5236 --- README.md | 2 +- yt_dlp/YoutubeDL.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7374e0e947..7b2c6ba717 100644 --- a/README.md +++ b/README.md @@ -1189,7 +1189,7 @@ # OUTPUT TEMPLATE 1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s` -1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (e.g. 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted) +1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing, `+` for Unicode), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (e.g. 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted) 1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. E.g. 
`%(title)+.100U` is NFKC diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 39df79a3fa..4e57dffa32 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1249,7 +1249,7 @@ def create_key(outer_mobj): elif fmt[-1] == 'j': # json value, fmt = json.dumps( value, default=_dumpjson_default, - indent=4 if '#' in flags else None, ensure_ascii=False), str_fmt + indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt elif fmt[-1] == 'h': # html value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted From 42a44f01c3f3be9c2af7d91807f0eb85168815e4 Mon Sep 17 00:00:00 2001 From: Vitaly Khabarov Date: Sat, 15 Oct 2022 11:46:08 +0300 Subject: [PATCH 272/284] [extractor/Fox] Extract thumbnail (#5243) Closes #1679 Authored by: vitkhab --- yt_dlp/extractor/fox.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py index 5996e86bb7..53826630fe 100644 --- a/yt_dlp/extractor/fox.py +++ b/yt_dlp/extractor/fox.py @@ -12,8 +12,10 @@ int_or_none, parse_age_limit, parse_duration, + traverse_obj, try_get, unified_timestamp, + url_or_none, ) @@ -34,7 +36,8 @@ class FOXIE(InfoExtractor): 'creator': 'FOX', 'series': 'Gotham', 'age_limit': 14, - 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight' + 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -165,6 +168,7 @@ def _real_extract(self, url): 'season_number': int_or_none(video.get('seasonNumber')), 'episode': video.get('name'), 'episode_number': int_or_none(video.get('episodeNumber')), + 'thumbnail': traverse_obj(video, ('images', 'still', 'raw'), expected_type=url_or_none), 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } From 217753f4aa184a5dac0d7c91c1f95de8b1880474 Mon Sep 17 00:00:00 2001 From: Matthew Date: Mon, 17 Oct 2022 18:46:24 +1300 Subject: [PATCH 273/284] [extractor/YoutubeWebArchive] Improve metadata extraction (#4968) Closes https://github.com/yt-dlp/yt-dlp/issues/4574 Authored by: coletdjnz Co-authored-by: pukkandan --- yt_dlp/extractor/archiveorg.py | 283 ++++++++++++++++++++++++++++----- 1 file changed, 239 insertions(+), 44 deletions(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 25a289ff62..4218f52d68 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -16,6 +16,7 @@ get_element_by_id, int_or_none, join_nonempty, + js_to_json, merge_dicts, mimetype2ext, orderedSet, @@ -367,7 +368,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', 'duration': 32, 'uploader_id': 'Zeurel', - 'uploader_url': 'http://www.youtube.com/user/Zeurel' + 'uploader_url': 'https://www.youtube.com/user/Zeurel', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg', } }, { # Internal link @@ -382,7 +385,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', 'duration': 771, 'uploader_id': '1veritasium', - 'uploader_url': 'http://www.youtube.com/user/1veritasium' + 'uploader_url': 'https://www.youtube.com/user/1veritasium', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA', } }, { # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. 
@@ -396,7 +401,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 398, 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', 'uploader_id': 'machinima', - 'uploader_url': 'http://www.youtube.com/user/machinima' + 'uploader_url': 'https://www.youtube.com/user/machinima', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'machinima' } }, { # FLV video. Video file URL does not provide itag information @@ -410,7 +417,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 19, 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', 'uploader_id': 'jawed', - 'uploader_url': 'http://www.youtube.com/user/jawed' + 'uploader_url': 'https://www.youtube.com/user/jawed', + 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'jawed', } }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', @@ -424,7 +434,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 204, 'description': 'md5:f7535343b6eda34a314eff8b85444680', 'uploader_id': 'itsmadeon', - 'uploader_url': 'http://www.youtube.com/user/itsmadeon' + 'uploader_url': 'https://www.youtube.com/user/itsmadeon', + 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w', + 'thumbnail': r're:https?://.*\.(jpg|webp)', } }, { # First capture is of dead video, second is the oldest from CDX response. @@ -435,10 +447,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', 'upload_date': '20160218', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 1236, + 'duration': 1235, 'description': 'md5:21032bae736421e89c2edf36d1936947', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', } }, { # First capture of dead video, capture date in link links to dead capture. @@ -449,10 +464,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'WTF: Video Games Still Launch BROKEN?! 
- T.U.G.S.', 'upload_date': '20160219', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 798, + 'duration': 797, 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', }, 'expected_warnings': [ r'unable to download capture webpage \(it may not be archived\)' @@ -472,12 +490,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'It\'s Bootleg AirPods Time.', 'upload_date': '20211021', 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', 'duration': 810, 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'thumbnail': r're:https?://.*\.(jpg|webp)', 'uploader': 'DankPods', - 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' } }, { # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 @@ -488,12 +505,135 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'bitch lasagna', 'upload_date': '20181005', 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'duration': 135, 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', 'uploader': 'PewDiePie', 'uploader_id': 'PewDiePie', - 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + 'uploader_url': 'https://www.youtube.com/user/PewDiePie', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~June 2010 Capture. swfconfig + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y', + 'info_dict': { + 'id': '8XeW5ilk-9Y', + 'ext': 'flv', + 'title': 'Story of Stuff, The Critique Part 4 of 4', + 'duration': 541, + 'description': 'md5:28157da06f2c5e94c97f7f3072509972', + 'uploader': 'HowTheWorldWorks', + 'uploader_id': 'HowTheWorldWorks', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090520', + } + }, { + # Jan 2011: watch-video-date/eow-date surrounded by whitespace + 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', + 'info_dict': { + 'id': 'Q_yjX80U7Yc', + 'ext': 'flv', + 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', + 'uploader_id': 'claybutlermusic', + 'description': 'md5:4595264559e3d0a0ceb3f011f6334543', + 'upload_date': '20090803', + 'uploader': 'claybutlermusic', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 132, + 'uploader_url': 'https://www.youtube.com/user/claybutlermusic', + } + }, { + # ~May 2009 swfArgs. 
ytcfg is spread out over various vars + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY', + 'info_dict': { + 'id': 'c5uJgG05xUY', + 'ext': 'webm', + 'title': 'Story of Stuff, The Critique Part 1 of 4', + 'uploader_id': 'HowTheWorldWorks', + 'uploader': 'HowTheWorldWorks', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090513', + 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 754, + } + }, { + # ~June 2012. Upload date is in another lang so cannot extract. + 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA', + 'info_dict': { + 'id': 'xWTLLl-dQaA', + 'ext': 'mp4', + 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)', + 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy', + 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e', + 'uploader_id': 'BlackNerdComedy', + 'uploader': 'BlackNerdComedy', + 'duration': 182, + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~July 2013 + 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM', + 'info_dict': { + 'id': '9eO1aasHyTM', + 'ext': 'mp4', + 'title': 'Polar-oid', + 'description': 'Cameras and bears are dangerous!', + 'uploader_url': 'https://www.youtube.com/user/punkybird', + 'uploader_id': 'punkybird', + 'duration': 202, + 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ', + 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ', + 'upload_date': '20060428', + 'uploader': 'punkybird', + } + }, { + # April 2020: Player response in player config + 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en', + 'info_dict': { + 'id': 'Cf7vS8jc7dY', + 'ext': 'mp4', + 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated', + 'duration': 64, + 'upload_date': '20200408', + 'uploader_id': 'GameGrumps', + 'uploader': 'GameGrumps', + 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ', + 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341', + 'uploader_url': 'https://www.youtube.com/user/GameGrumps', + } + }, { + # watch7-user-header with yt-user-info + 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057', + 'info_dict': { + 'id': 'kbh4T_b4Ixw', + 'ext': 'mp4', + 'title': 'Shovel Knight OST - Strike the Earth! 
Plains of Passage 16 bit SNES style remake / remix', + 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA', + 'uploader': 'Nelward music', + 'duration': 213, + 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'upload_date': '20150503', + 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA', + } + }, { + # April 2012 + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU', + 'info_dict': { + 'id': 'SOm7mPoPskU', + 'ext': 'mp4', + 'title': 'Boyfriend - Justin Bieber Parody', + 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01', + 'uploader': 'thecomputernerd01', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491', + 'duration': 200, + 'upload_date': '20120407', + 'uploader_id': 'thecomputernerd01', } }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', @@ -574,6 +714,27 @@ def _extract_metadata(self, video_id, webpage): initial_data = self._search_json( self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) + ytcfg = {} + for j in re.findall(r'yt\.setConfig\(\s*(?P{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010 + ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {}) + + # XXX: this also may contain a 'ptchn' key + player_config = ( + self._search_json( + r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=', + webpage, 'player config', video_id, default=None) + or ytcfg.get('PLAYER_CONFIG') or {}) + + # XXX: this may also contain a 'creator' key. + swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={}) + if swf_args and not traverse_obj(player_config, ('args',)): + player_config['args'] = swf_args + + if not player_response: + # April 2020 + player_response = self._parse_json( + traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False) + initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), expected_type=dict, get_all=False, default={}) @@ -588,21 +749,64 @@ def _extract_metadata(self, video_id, webpage): video_details.get('title') or YoutubeBaseInfoExtractor._get_text(microformats, 'title') or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or traverse_obj(player_config, ('args', 'title')) or self._extract_webpage_title(webpage) or search_meta(['og:title', 'twitter:title', 'title'])) + def id_from_url(url, type_): + return self._search_regex( + rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None) + + # XXX: would the get_elements_by_... functions be better suited here? 
+ _CHANNEL_URL_HREF_RE = r'href="[^"]*(?Phttps?://www\.youtube\.com/(?:user|channel)/[^"]+)"' + uploader_or_channel_url = self._search_regex( + [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024 + fr']*>\s*]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012 + webpage, 'uploader or channel url', default=None) + + owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2 + + # Uploader refers to the /user/ id ONLY + uploader_id = ( + id_from_url(owner_profile_url, 'user') + or id_from_url(uploader_or_channel_url, 'user') + or ytcfg.get('VIDEO_USERNAME')) + uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None + + # XXX: do we want to differentiate uploader and channel? + uploader = ( + self._search_regex( + [r']*>\s*([^<]+)', # June 2010 + r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009 + r']*>\s*]*>\s*(.+?)\s*]*title="\s*(.+?)\s*"'], # ~June 2012 + webpage, 'uploader', default=None) + or self._html_search_regex( + [r'(?s)]*[^>]*>\s*(.*?)\s*]*yt-user-name[^>]*>\s*(.*?)\s*(?:(?!\1).)+)\1', # @b45a9e6 - webpage, 'channel id', default=None, group='id')) - channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + webpage, 'channel id', default=None, group='id') + or id_from_url(owner_profile_url, 'channel') + or id_from_url(uploader_or_channel_url, 'channel') + or traverse_obj(player_config, ('args', 'ucid'))) + channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None duration = int_or_none( video_details.get('lengthSeconds') or microformats.get('lengthSeconds') + or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False) or parse_duration(search_meta('duration'))) description = ( video_details.get('shortDescription') @@ -610,26 +814,13 @@ def _extract_metadata(self, video_id, webpage): or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 or search_meta(['description', 'og:description', 'twitter:description'])) - uploader = video_details.get('author') - - # Uploader ID and URL - uploader_mobj = re.search( - r'', # @fd05024 - webpage) - if uploader_mobj is not None: - uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') - else: - # @a6211d2 - uploader_url = url_or_none(microformats.get('ownerProfileUrl')) - uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) - upload_date = unified_strdate( dict_get(microformats, ('uploadDate', 'publishDate')) or search_meta(['uploadDate', 'datePublished']) or self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + [r'(?s)id="eow-date.*?>\s*(.*?)\s*', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520 + r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively) webpage, 'upload date', default=None)) return { @@ -698,18 +889,22 @@ def _real_extract(self, url): url_date = url_date or url_date_2 urlh = None - try: - urlh = self._request_webpage( - HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), - video_id, note='Fetching archived video file url', expected_status=True) - except ExtractorError as e: - 
# HTTP Error 404 is expected if the video is not saved. - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_no_formats( - 'The requested video is not archived, indexed, or there is an issue with web.archive.org', - expected=True) - else: - raise + retry_manager = self.RetryManager(fatal=False) + for retry in retry_manager: + try: + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) + except ExtractorError as e: + # HTTP Error 404 is expected if the video is not saved. + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) + else: + retry.error = e + + if retry_manager.error: + self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) From 2576d53a312efee864af023ea819c6608558bd1b Mon Sep 17 00:00:00 2001 From: cruel-efficiency <60464829+cruel-efficiency@users.noreply.github.com> Date: Tue, 18 Oct 2022 05:51:43 -0700 Subject: [PATCH 274/284] Fix end time of clips (#5255) Closes #5256 Authored by: cruel-efficiency --- yt_dlp/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4e57dffa32..13725cddc3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2720,7 +2720,8 @@ def to_screen(*msg): if chapter or offset: new_info.update({ 'section_start': offset + chapter.get('start_time', 0), - 'section_end': end_time if end_time < offset + duration else None, + # duration may not be accurate. 
So allow deviations <1sec + 'section_end': end_time if end_time <= offset + duration + 1 else None, 'section_title': chapter.get('title'), 'section_number': chapter.get('index'), }) From 814bba3933ca36a79c68ac737b805cf25c407521 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 18:33:00 +0530 Subject: [PATCH 275/284] [downloader/fragment] HLS download can continue without first fragment Closes #5274 --- yt_dlp/downloader/dash.py | 2 +- yt_dlp/downloader/f4m.py | 4 +-- yt_dlp/downloader/fragment.py | 49 +++++++++++++------------- yt_dlp/downloader/ism.py | 3 +- yt_dlp/downloader/mhtml.py | 3 +- yt_dlp/downloader/youtube_live_chat.py | 3 +- 6 files changed, 30 insertions(+), 34 deletions(-) diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index a6da26f09d..8723e10689 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -51,7 +51,7 @@ def real_download(self, filename, info_dict): args.append([ctx, fragments_to_download, fmt]) - return self.download_and_append_fragments_multiple(*args) + return self.download_and_append_fragments_multiple(*args, is_fatal=lambda idx: idx == 0) def _resolve_fragments(self, fragments, ctx): fragments = fragments(ctx) if callable(fragments) else fragments diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py index a19ab43f15..306f92192f 100644 --- a/yt_dlp/downloader/f4m.py +++ b/yt_dlp/downloader/f4m.py @@ -424,6 +424,4 @@ def real_download(self, filename, info_dict): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - self._finish_frag_download(ctx, info_dict) - - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index a5d70d0d49..83f7870edb 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -295,16 +295,23 @@ def _finish_frag_download(self, ctx, info_dict): self.try_remove(ytdl_filename) elapsed = time.time() - ctx['started'] - if ctx['tmpfilename'] == '-': - downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + to_file = ctx['tmpfilename'] != '-' + if to_file: + downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) else: + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + + if not downloaded_bytes: + if to_file: + self.try_remove(ctx['tmpfilename']) + self.report_error('The downloaded file is empty') + return False + elif to_file: self.try_rename(ctx['tmpfilename'], ctx['filename']) - if self.params.get('updatetime', True): - filetime = ctx.get('fragment_filetime') - if filetime: - with contextlib.suppress(Exception): - os.utime(ctx['filename'], (time.time(), filetime)) - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + filetime = ctx.get('fragment_filetime') + if self.params.get('updatetime', True) and filetime: + with contextlib.suppress(Exception): + os.utime(ctx['filename'], (time.time(), filetime)) self._hook_progress({ 'downloaded_bytes': downloaded_bytes, @@ -316,6 +323,7 @@ def _finish_frag_download(self, ctx, info_dict): 'max_progress': ctx.get('max_progress'), 'progress_idx': ctx.get('progress_idx'), }, info_dict) + return True def _prepare_external_frag_download(self, ctx): if 'live' not in ctx: @@ -362,7 +370,7 @@ def decrypt_fragment(fragment, frag_content): return decrypt_fragment - def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_func=None): + def download_and_append_fragments_multiple(self, *args, **kwargs): ''' @params (ctx1, fragments1, 
info_dict1), (ctx2, fragments2, info_dict2), ... all args must be either tuple or list @@ -370,7 +378,7 @@ def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_f interrupt_trigger = [True] max_progress = len(args) if max_progress == 1: - return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) + return self.download_and_append_fragments(*args[0], **kwargs) max_workers = self.params.get('concurrent_fragment_downloads', 1) if max_progress > 1: self._prepare_multiline_status(max_progress) @@ -380,8 +388,7 @@ def thread_func(idx, ctx, fragments, info_dict, tpe): ctx['max_progress'] = max_progress ctx['progress_idx'] = idx return self.download_and_append_fragments( - ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, - tpe=tpe, interrupt_trigger=interrupt_trigger) + ctx, fragments, info_dict, **kwargs, tpe=tpe, interrupt_trigger=interrupt_trigger) class FTPE(concurrent.futures.ThreadPoolExecutor): # has to stop this or it's going to wait on the worker thread itself @@ -428,17 +435,12 @@ def interrupt_trigger_iter(fg): return result def download_and_append_fragments( - self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, - tpe=None, interrupt_trigger=None): - if not interrupt_trigger: - interrupt_trigger = (True, ) + self, ctx, fragments, info_dict, *, is_fatal=(lambda idx: False), + pack_func=(lambda content, idx: content), finish_func=None, + tpe=None, interrupt_trigger=(True, )): - is_fatal = ( - ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0)) - if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)) - - if not pack_func: - pack_func = lambda frag_content, _: frag_content + if not self.params.get('skip_unavailable_fragments', True): + is_fatal = lambda _: True def download_fragment(fragment, ctx): if not interrupt_trigger[0]: @@ -527,5 +529,4 @@ def _download_fragment(fragment): if finish_func is not None: ctx['dest_stream'].write(finish_func()) ctx['dest_stream'].flush() - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index c961dc62e9..a157a8ad93 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -280,5 +280,4 @@ def real_download(self, filename, info_dict): return False self.report_skip_fragment(frag_index) - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index ed076e09ed..d977dcec31 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -186,5 +186,4 @@ def real_download(self, filename, info_dict): ctx['dest_stream'].write( b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index 1bc3209dc4..5928fecf0b 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -191,8 +191,7 @@ def download_and_parse_fragment(url, frag_index, request_data=None, headers=None if test: break - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) @staticmethod def parse_live_timestamp(action): From 63c547d71ceae6be181948b4b6ce4180b16f4209 Mon Sep 17 00:00:00 2001 
From: Ajay Ramachandran Date: Tue, 18 Oct 2022 12:51:57 -0400 Subject: [PATCH 276/284] [SponsorBlock] Support `chapter` category (#5260) Authored by: ajayyy, pukkandan --- README.md | 6 ++--- test/test_postprocessors.py | 34 +++++++++++++++++++------ yt_dlp/options.py | 4 +-- yt_dlp/postprocessor/modify_chapters.py | 13 +++++----- yt_dlp/postprocessor/sponsorblock.py | 13 +++++++--- 5 files changed, 46 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 7b2c6ba717..e7fc6886a4 100644 --- a/README.md +++ b/README.md @@ -1042,7 +1042,7 @@ ## SponsorBlock Options: for, separated by commas. Available categories are sponsor, intro, outro, selfpromo, preview, filler, interaction, - music_offtopic, poi_highlight, all and + music_offtopic, poi_highlight, chapter, all and default (=all). You can prefix the category with a "-" to exclude it. See [1] for description of the categories. E.g. @@ -1054,8 +1054,8 @@ ## SponsorBlock Options: remove takes precedence. The syntax and available categories are the same as for --sponsorblock-mark except that "default" - refers to "all,-filler" and poi_highlight is - not available + refers to "all,-filler" and poi_highlight and + chapter are not available --sponsorblock-chapter-title TEMPLATE An output template for the title of the SponsorBlock chapters created by diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index c49e3ede0f..52e5587729 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -16,6 +16,7 @@ MetadataFromFieldPP, MetadataParserPP, ModifyChaptersPP, + SponsorBlockPP, ) @@ -76,11 +77,15 @@ def setUp(self): self._pp = ModifyChaptersPP(YoutubeDL()) @staticmethod - def _sponsor_chapter(start, end, cat, remove=False): - c = {'start_time': start, 'end_time': end, '_categories': [(cat, start, end)]} - if remove: - c['remove'] = True - return c + def _sponsor_chapter(start, end, cat, remove=False, title=None): + if title is None: + title = SponsorBlockPP.CATEGORIES[cat] + return { + 'start_time': start, + 'end_time': end, + '_categories': [(cat, start, end, title)], + **({'remove': True} if remove else {}), + } @staticmethod def _chapter(start, end, title=None, remove=False): @@ -130,6 +135,19 @@ def test_remove_marked_arrange_sponsors_ChapterWithSponsors(self): 'c', '[SponsorBlock]: Filler Tangent', 'c']) self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + def test_remove_marked_arrange_sponsors_SponsorBlockChapters(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'chapter', title='sb c1'), + self._sponsor_chapter(15, 16, 'chapter', title='sb c2'), + self._sponsor_chapter(30, 40, 'preview'), + self._sponsor_chapter(50, 60, 'filler')] + expected = self._chapters( + [10, 15, 16, 20, 30, 40, 50, 60, 70], + ['c', '[SponsorBlock]: sb c1', '[SponsorBlock]: sb c1, sb c2', '[SponsorBlock]: sb c1', + 'c', '[SponsorBlock]: Preview/Recap', + 'c', '[SponsorBlock]: Filler Tangent', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self): chapters = self._chapters([120], ['c']) + [ self._sponsor_chapter(10, 45, 'sponsor'), self._sponsor_chapter(20, 40, 'selfpromo'), @@ -173,7 +191,7 @@ def test_remove_marked_arrange_sponsors_ChapterWithSponsorCutInTheMiddle(self): self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) def test_remove_marked_arrange_sponsors_ChapterWithCutHidingSponsor(self): - cuts = [self._sponsor_chapter(20, 
50, 'selpromo', remove=True)] + cuts = [self._sponsor_chapter(20, 50, 'selfpromo', remove=True)] chapters = self._chapters([60], ['c']) + [ self._sponsor_chapter(10, 20, 'intro'), self._sponsor_chapter(30, 40, 'sponsor'), @@ -199,7 +217,7 @@ def test_remove_marked_arrange_sponsors_ChapterWithAdjacentCuts(self): self._sponsor_chapter(10, 20, 'sponsor'), self._sponsor_chapter(20, 30, 'interaction', remove=True), self._chapter(30, 40, remove=True), - self._sponsor_chapter(40, 50, 'selpromo', remove=True), + self._sponsor_chapter(40, 50, 'selfpromo', remove=True), self._sponsor_chapter(50, 60, 'interaction')] expected = self._chapters([10, 20, 30, 40], ['c', '[SponsorBlock]: Sponsor', @@ -282,7 +300,7 @@ def test_remove_marked_arrange_sponsors_SponsorsNoLongerOverlapAfterCut(self): chapters = self._chapters([70], ['c']) + [ self._sponsor_chapter(10, 30, 'sponsor'), self._sponsor_chapter(20, 50, 'interaction'), - self._sponsor_chapter(30, 50, 'selpromo', remove=True), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True), self._sponsor_chapter(40, 60, 'sponsor'), self._sponsor_chapter(50, 60, 'interaction')] expected = self._chapters( diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 5ff375fcfa..d3dfee820a 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1737,7 +1737,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--sponsorblock-remove', metavar='CATS', dest='sponsorblock_remove', default=set(), action='callback', type='str', callback=_set_from_options_callback, callback_kwargs={ - 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.POI_CATEGORIES.keys()), + 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys()), # Note: From https://wiki.sponsor.ajay.app/w/Types: # The filler category is very aggressive. # It is strongly recommended to not use this in a client by default. @@ -1747,7 +1747,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'If a category is present in both mark and remove, remove takes precedence. 
' 'The syntax and available categories are the same as for --sponsorblock-mark ' 'except that "default" refers to "all,-filler" ' - f'and {", ".join(SponsorBlockPP.POI_CATEGORIES.keys())} is not available')) + f'and {", ".join(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys())} are not available')) sponsorblock.add_option( '--sponsorblock-chapter-title', metavar='TEMPLATE', default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title', diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py index 6959222c83..b2b1acca40 100644 --- a/yt_dlp/postprocessor/modify_chapters.py +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -16,7 +16,7 @@ def __init__(self, downloader, remove_chapters_patterns=None, remove_sponsor_seg *, sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False): FFmpegPostProcessor.__init__(self, downloader) self._remove_chapters_patterns = set(remove_chapters_patterns or []) - self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.POI_CATEGORIES.keys()) + self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys()) self._ranges_to_remove = set(remove_ranges or []) self._sponsorblock_chapter_title = sponsorblock_chapter_title self._force_keyframes = force_keyframes @@ -99,7 +99,7 @@ def _mark_chapters_to_remove(self, chapters, sponsor_chapters): 'start_time': start, 'end_time': end, 'category': 'manually_removed', - '_categories': [('manually_removed', start, end)], + '_categories': [('manually_removed', start, end, 'Manually removed')], 'remove': True, } for start, end in self._ranges_to_remove) @@ -290,13 +290,12 @@ def _remove_tiny_rename_sponsors(self, chapters): c.pop('_was_cut', None) cats = c.pop('_categories', None) if cats: - category = min(cats, key=lambda c: c[2] - c[1])[0] - cats = orderedSet(x[0] for x in cats) + category, _, _, category_name = min(cats, key=lambda c: c[2] - c[1]) c.update({ 'category': category, - 'categories': cats, - 'name': SponsorBlockPP.CATEGORIES[category], - 'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats] + 'categories': orderedSet(x[0] for x in cats), + 'name': category_name, + 'category_names': orderedSet(x[3] for x in cats), }) c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c.copy()) # Merge identically named sponsors. 
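To make the renaming logic above concrete, here is a minimal standalone sketch (the sample tuples are illustrative, not taken from this patch) of how the narrowest overlapping segment now supplies both the category and its display name:

```python
# Each _categories entry is a (category, start, end, title) tuple;
# the segment covering the least time wins the naming contest
cats = [('sponsor', 10, 45, 'Sponsor'), ('chapter', 15, 16, 'sb c2')]
category, _, _, category_name = min(cats, key=lambda c: c[2] - c[1])
assert (category, category_name) == ('chapter', 'sb c2')  # 1s span beats 35s
```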
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index d79ed7ae77..befff0e1f2 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -14,6 +14,10 @@ class SponsorBlockPP(FFmpegPostProcessor): POI_CATEGORIES = { 'poi_highlight': 'Highlight', } + NON_SKIPPABLE_CATEGORIES = { + **POI_CATEGORIES, + 'chapter': 'Chapter', + } CATEGORIES = { 'sponsor': 'Sponsor', 'intro': 'Intermission/Intro Animation', @@ -23,7 +27,7 @@ class SponsorBlockPP(FFmpegPostProcessor): 'filler': 'Filler Tangent', 'interaction': 'Interaction Reminder', 'music_offtopic': 'Non-Music Section', - **POI_CATEGORIES, + **NON_SKIPPABLE_CATEGORIES } def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): @@ -68,12 +72,13 @@ def duration_filter(s): def to_chapter(s): (start, end), cat = s['segment'], s['category'] + title = s['description'] if cat == 'chapter' else self.CATEGORIES[cat] return { 'start_time': start, 'end_time': end, 'category': cat, - 'title': self.CATEGORIES[cat], - '_categories': [(cat, start, end)] + 'title': title, + '_categories': [(cat, start, end, title)], } sponsor_chapters = [to_chapter(s) for s in duration_match] @@ -89,7 +94,7 @@ def _get_sponsor_segments(self, video_id, service): url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + urllib.parse.urlencode({ 'service': service, 'categories': json.dumps(self._categories), - 'actionTypes': json.dumps(['skip', 'poi']) + 'actionTypes': json.dumps(['skip', 'poi', 'chapter']) }) for d in self._download_json(url) or []: if d['videoID'] == video_id: From 1338ae3ba338d116ab75d787cc6d637d382d0f77 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 23:08:23 +0530 Subject: [PATCH 277/284] [SponsorBlock] Add `type` field --- README.md | 3 ++- yt_dlp/postprocessor/sponsorblock.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e7fc6886a4..5890004565 100644 --- a/README.md +++ b/README.md @@ -1311,10 +1311,11 @@ # OUTPUT TEMPLATE - `start_time` (numeric): Start time of the chapter in seconds - `end_time` (numeric): End time of the chapter in seconds - - `categories` (list): The SponsorBlock categories the chapter belongs to + - `categories` (list): The [SponsorBlock categories](https://wiki.sponsor.ajay.app/w/Types#Category) the chapter belongs to - `category` (string): The smallest SponsorBlock category the chapter belongs to - `category_names` (list): Friendly names of the categories - `name` (string): Friendly name of the smallest category + - `type` (string): The [SponsorBlock action type](https://wiki.sponsor.ajay.app/w/Types#Action_Type) of the chapter Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. E.g. for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. 
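As a hedged illustration of the chapter fields above (the video URL is a placeholder, not from this patch), the new `type` field can be combined with `name` when titling SponsorBlock chapters:

```
yt-dlp --sponsorblock-mark all \
  --sponsorblock-chapter-title "[SponsorBlock %(type)s]: %(name)s" \
  "https://www.youtube.com/watch?v=PLACEHOLDER"
```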
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index befff0e1f2..bb15eb7096 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -78,6 +78,7 @@ def to_chapter(s): 'end_time': end, 'category': cat, 'title': title, + 'type': s['actionType'], '_categories': [(cat, start, end, title)], } From 8fab23301c79a927592dda710a60903423beffbb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 22:58:49 +0530 Subject: [PATCH 278/284] [SponsorBlock] Obey `--retry-sleep extractor` --- yt_dlp/postprocessor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index 44feda4278..537792b07f 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -195,9 +195,9 @@ def report_progress(self, s): def _retry_download(self, err, count, retries): # While this is not an extractor, it behaves similar to one and - # so obey extractor_retries and sleep_interval_requests + # so obey extractor_retries and "--retry-sleep extractor" RetryManager.report_retry(err, count, retries, info=self.to_screen, warn=self.report_warning, - sleep_func=self.get_param('sleep_interval_requests')) + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) def _download_json(self, url, *, expected_http_errors=(404,)): self.write_debug(f'{self.PP_NAME} query: {url}') From a7ddbc0475db14d5249a312e4e03aaf0adc82647 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 23:00:27 +0530 Subject: [PATCH 279/284] [ModifyChapters] Handle the entire video being marked for removal Closes #5238 --- yt_dlp/postprocessor/modify_chapters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py index b2b1acca40..a745b4524c 100644 --- a/yt_dlp/postprocessor/modify_chapters.py +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -37,6 +37,9 @@ def run(self, info): info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters) if not cuts: return [], info + elif not info['chapters']: + self.report_warning('You have requested to remove the entire video, which is not possible') + return [], info original_duration, info['duration'] = info.get('duration'), info['chapters'][-1]['end_time'] if self._duration_mismatch(real_duration, original_duration, 1): From 73ac0e6b857ca138481594cb24d9532ba2714a02 Mon Sep 17 00:00:00 2001 From: jahway603 <64485701+jahway603@users.noreply.github.com> Date: Tue, 18 Oct 2022 13:55:52 -0400 Subject: [PATCH 280/284] [docs, devscripts] Document `pyinst`'s argument passthrough (#5235) Closes #4631 Authored by: jahway603 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5890004565..a306b199e9 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,8 @@ ### Standalone PyInstaller Builds On some systems, you may need to use `py` or `python` instead of `python3`. +`pyinst.py` accepts any arguments that can be passed to `pyinstaller`, such as `--onefile/-F` or `--onedir/-D`, which is further [documented here](https://pyinstaller.org/en/stable/usage.html#what-to-generate). + Note that pyinstaller with versions below 4.4 [do not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment. 
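For instance (a sketch of the documented passthrough, not an exhaustive list; any other pyinstaller option is forwarded the same way):

```
python3 pyinst.py --onefile    # build a single-file executable
python3 pyinst.py --onedir     # build a one-folder bundle
```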
**Important**: Running `pyinstaller` directly **without** using `pyinst.py` is **not** officially supported. This may or may not work correctly. From cd5df121f3577178cb73bafe886677da9452dc42 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 23:19:25 +0530 Subject: [PATCH 281/284] [SponsorBlock] Relax duration check for large segments --- yt_dlp/postprocessor/sponsorblock.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index bb15eb7096..188eb059aa 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -64,7 +64,8 @@ def duration_filter(s): if duration and duration - start_end[1] <= 1: start_end[1] = duration # SponsorBlock duration may be absent or it may deviate from the real one. - return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1 + diff = abs(duration - s['videoDuration']) if s['videoDuration'] else 0 + return diff < 1 or (diff < 5 and diff / (start_end[1] - start_end[0]) < 0.05) duration_match = [s for s in segments if duration_filter(s)] if len(duration_match) != len(segments): From d5d1df8afdd532cc889f9d95be0740668a0776fe Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 18 Oct 2022 23:28:57 +0530 Subject: [PATCH 282/284] [cleanup Misc Closes #5162 --- README.md | 2 +- yt_dlp/YoutubeDL.py | 4 ++-- yt_dlp/__init__.py | 2 ++ yt_dlp/__main__.py | 1 - yt_dlp/downloader/common.py | 10 +++++----- yt_dlp/extractor/common.py | 4 +++- yt_dlp/extractor/generic.py | 8 +++----- yt_dlp/extractor/prankcast.py | 17 +++++++++++++++++ yt_dlp/extractor/tv24ua.py | 7 +------ yt_dlp/extractor/youtube.py | 15 ++++++++++----- yt_dlp/postprocessor/sponsorblock.py | 2 +- yt_dlp/utils.py | 8 +++----- 12 files changed, 48 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index a306b199e9..4f731785d3 100644 --- a/README.md +++ b/README.md @@ -1193,7 +1193,7 @@ # OUTPUT TEMPLATE 1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, yt-dlp additionally supports converting to `B` = **B**ytes, `j` = **j**son (flag `#` for pretty-printing, `+` for Unicode), `h` = HTML escaping, `l` = a comma separated **l**ist (flag `#` for `\n` newline-separated), `q` = a string **q**uoted for the terminal (flag `#` to split a list into different arguments), `D` = add **D**ecimal suffixes (e.g. 10M) (flag `#` to use 1024 as factor), and `S` = **S**anitize as filename (flag `#` for restricted) -1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. E.g. `%(title)+.100U` is NFKC +1. **Unicode normalization**: The format type `U` can be used for NFC [Unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. E.g. 
`%(title)+.100U` is NFKC To summarize, the general syntax for a field is: ``` diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 13725cddc3..42780e7941 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -548,7 +548,7 @@ class YoutubeDL: # NB: Keep in sync with the docstring of extractor/common.py 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', - 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', + 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options', @@ -3586,7 +3586,7 @@ def render_formats_table(self, info_dict): format_field(f, 'ext'), self.format_resolution(f), self._format_note(f) - ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + ] for f in formats if (f.get('preference') or 0) >= -1000] return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1) def simplified_codec(f, field): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 9382ff43ba..726fb0685c 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -962,6 +962,8 @@ def _real_main(argv=None): def main(argv=None): + global _IN_CLI + _IN_CLI = True try: _exit(*variadic(_real_main(argv))) except DownloadError: diff --git a/yt_dlp/__main__.py b/yt_dlp/__main__.py index 895918c272..ff5d71d3c9 100644 --- a/yt_dlp/__main__.py +++ b/yt_dlp/__main__.py @@ -14,5 +14,4 @@ import yt_dlp if __name__ == '__main__': - yt_dlp._IN_CLI = True yt_dlp.main() diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 221b3827c7..8d110c3747 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -333,7 +333,7 @@ def with_fields(*tups, default=''): return tmpl return default - _formats_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' + _format_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' if s['status'] == 'finished': if self.params.get('noprogress'): @@ -342,7 +342,7 @@ def with_fields(*tups, default=''): s.update({ 'speed': speed, '_speed_str': self.format_speed(speed).strip(), - '_total_bytes_str': _formats_bytes('total_bytes'), + '_total_bytes_str': _format_bytes('total_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), '_percent_str': self.format_percent(100), }) @@ -363,9 +363,9 @@ def with_fields(*tups, default=''): lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], lambda: s['downloaded_bytes'] == 0 and 0)), - '_total_bytes_str': _formats_bytes('total_bytes'), - '_total_bytes_estimate_str': _formats_bytes('total_bytes_estimate'), - '_downloaded_bytes_str': _formats_bytes('downloaded_bytes'), + '_total_bytes_str': _format_bytes('total_bytes'), + '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), + '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), }) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ab8def57da..ec3fb58e56 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1108,7 +1108,9 @@ def get_param(self, name, default=None, *args, **kwargs): return 
self._downloader.params.get(name, default, *args, **kwargs) return default - def report_drm(self, video_id, partial=False): + def report_drm(self, video_id, partial=NO_DEFAULT): + if partial is not NO_DEFAULT: + self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial') self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id) def report_extraction(self, id_or_name): diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index b7a5ffb5b1..5abde33a91 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -32,6 +32,7 @@ unified_timestamp, unsmuggle_url, url_or_none, + variadic, xpath_attr, xpath_text, xpath_with_ns, @@ -2820,11 +2821,8 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): webpage) if mobj is not None: varname = mobj.group(1) - sources = self._parse_json( - mobj.group(2), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] + sources = variadic(self._parse_json( + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) formats = [] subtitles = {} for source in sources: diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py index 7446caf3c0..0eb5f98d19 100644 --- a/yt_dlp/extractor/prankcast.py +++ b/yt_dlp/extractor/prankcast.py @@ -21,6 +21,23 @@ class PrankCastIE(InfoExtractor): 'tags': ['prank call', 'prank'], 'upload_date': '20220825' } + }, { + 'url': 'https://prankcast.com/phonelosers/showreel/2048-NOT-COOL', + 'info_dict': { + 'id': '2048', + 'ext': 'mp3', + 'title': 'NOT COOL', + 'display_id': 'NOT-COOL', + 'timestamp': 1665028364, + 'uploader': 'phonelosers', + 'channel_id': 6, + 'duration': 4044, + 'cast': ['phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank'], + 'upload_date': '20221006' + } }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tv24ua.py b/yt_dlp/extractor/tv24ua.py index 553a70b6b2..2f2571df76 100644 --- a/yt_dlp/extractor/tv24ua.py +++ b/yt_dlp/extractor/tv24ua.py @@ -1,12 +1,7 @@ import re from .common import InfoExtractor -from ..utils import ( - determine_ext, - js_to_json, - mimetype2ext, - traverse_obj, -) +from ..utils import determine_ext, js_to_json, mimetype2ext, traverse_obj class TV24UAVideoIE(InfoExtractor): diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 857c9670c5..a12e5b03e7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1721,7 +1721,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'playable_in_embed': True, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -1754,7 +1755,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -2019,7 +2021,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 522, 'channel': 'kudvenkat', 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -2169,7 +2172,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'live_status': 'not_live', 'playable_in_embed': True, - 'channel_follower_count': 
int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'format': '17', # 3gp format available on android @@ -2213,7 +2217,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 248, 'categories': ['Education'], 'age_limit': 0, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 188eb059aa..6ba87cd672 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -85,7 +85,7 @@ def to_chapter(s): sponsor_chapters = [to_chapter(s) for s in duration_match] if not sponsor_chapters: - self.to_screen('No segments were found in the SponsorBlock database') + self.to_screen('No matching segments were found in the SponsorBlock database') else: self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database') return sponsor_chapters diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index adb7c0e8c5..1e2342f3e9 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5724,7 +5724,7 @@ def parse_args(self): return self.parser.parse_args(self.all_args) -class WebSocketsWrapper(): +class WebSocketsWrapper: """Wraps websockets module to use in non-async scopes""" pool = None @@ -5808,11 +5808,9 @@ def cached_method(f): def wrapper(self, *args, **kwargs): bound_args = signature.bind(self, *args, **kwargs) bound_args.apply_defaults() - key = tuple(bound_args.arguments.values()) + key = tuple(bound_args.arguments.values())[1:] - if not hasattr(self, '__cached_method__cache'): - self.__cached_method__cache = {} - cache = self.__cached_method__cache.setdefault(f.__name__, {}) + cache = vars(self).setdefault('__cached_method__cache', {}).setdefault(f.__name__, {}) if key not in cache: cache[key] = f(self, *args, **kwargs) return cache[key] From 5318156f1c6e9567b7d44910d3301ca4cc876784 Mon Sep 17 00:00:00 2001 From: bsun0000 Date: Wed, 19 Oct 2022 00:05:54 +0530 Subject: [PATCH 283/284] [extractor/youtube] Mark videos as fully watched Closes #2555 Authored by: bsun0000 --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a12e5b03e7..e894f74cdb 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2955,7 +2955,7 @@ def _mark_watched(self, video_id, player_responses): # these seem to mark watchtime "history" in the real world # they're required, so send in a single value qs.update({ - 'st': video_length, + 'st': 0, 'et': video_length, }) From a4713ba96d8b4905e9e8c37fb3b0c1826ae28e25 Mon Sep 17 00:00:00 2001 From: Anant Murmu Date: Wed, 19 Oct 2022 12:25:28 +0530 Subject: [PATCH 284/284] [extractor/voot] Improve `_VALID_URL` (#5283) Authored by: freezboltz --- yt_dlp/extractor/voot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py index 7ac38a813a..173556e664 100644 --- a/yt_dlp/extractor/voot.py +++ b/yt_dlp/extractor/voot.py @@ -14,7 +14,7 @@ class VootIE(InfoExtractor): voot:| https?://(?:www\.)?voot\.com/? 
(?: - movies/[^/]+/| + movies?/[^/]+/| (?:shows|kids)/(?:[^/]+/){4} ) ) @@ -47,6 +47,9 @@ class VootIE(InfoExtractor): }, { 'url': 'https://www.voot.com/movies/pandavas-5/424627', 'only_matching': True, + }, { + 'url': 'https://www.voot.com/movie/fight-club/621842', + 'only_matching': True, }] def _real_extract(self, url):