From 46c1b7cfec1d0e6155083ca7e6948674c64ecb97 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:13:08 -0500 Subject: [PATCH 01/48] [build] Cache dependencies for `macos` job (#10088) Authored by: bashonly --- .github/workflows/build.yml | 50 +++++++++++++++++++++++---- .github/workflows/release-master.yml | 3 +- .github/workflows/release-nightly.yml | 3 +- .github/workflows/release.yml | 3 +- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9a1a22e8f5..1adb62dfb1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -237,27 +237,43 @@ jobs: macos: needs: process if: inputs.macos + permissions: + contents: read + actions: write # For cleaning up cache runs-on: macos-12 steps: - uses: actions/checkout@v4 # NB: Building universal2 does not work with python from actions/setup-python + + - name: Restore cached requirements + id: restore-cache + uses: actions/cache/restore@v4 + env: + SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + - name: Install Requirements run: | brew install coreutils - python3 devscripts/install_deps.py --user -o --include build + python3 -m venv ~/yt-dlp-build-venv + source ~/yt-dlp-build-venv/bin/activate + python3 devscripts/install_deps.py -o --include build python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt # We need to ignore wheels otherwise we break universal2 builds - python3 -m pip install -U --user --no-binary :all: -r requirements.txt + python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U --user delocate + python3 -m pip install -U delocate mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do python3 -m pip download \ --only-binary=:all: \ --platform "${platform}" \ - --pre -d curl_cffi_whls \ + -d curl_cffi_whls \ -r requirements.txt done ( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite @@ -274,9 +290,10 @@ jobs: ) python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2 python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2 - cd curl_cffi_universal2 - for wheel in ./*cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done - python3 -m pip install -U --user ./*cffi*.whl + for wheel in curl_cffi_universal2/*cffi*.whl; do + mv -n -- "${wheel}" "${wheel/x86_64/universal2}" + done + python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl - name: Prepare run: | @@ -284,6 +301,7 @@ jobs: python3 devscripts/make_lazy_extractors.py - name: Build run: | + source ~/yt-dlp-build-venv/bin/activate python3 -m bundle.pyinstaller --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) 
python3 -m bundle.pyinstaller --target-architecture universal2 @@ -307,6 +325,24 @@ jobs: dist/yt-dlp_macos.zip compression-level: 0 + - name: Cleanup cache + if: steps.restore-cache.outputs.cache-hit == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + cache_key: cache-reqs-${{ github.job }} + repository: ${{ github.repository }} + branch: ${{ github.ref }} + run: | + gh extension install actions/gh-actions-cache + gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm + + - name: Cache requirements + uses: actions/cache/save@v4 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + macos_legacy: needs: process if: inputs.macos_legacy diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml index a84547580b..c49319b171 100644 --- a/.github/workflows/release-master.yml +++ b/.github/workflows/release-master.yml @@ -24,6 +24,7 @@ jobs: source: master permissions: contents: write - packages: write + packages: write # For package cache + actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index f459a3a17e..b536c50669 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -37,6 +37,7 @@ jobs: source: nightly permissions: contents: write - packages: write + packages: write # For package cache + actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 32268b32f3..fa5ad7e515 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -228,7 +228,8 @@ jobs: origin: ${{ needs.prepare.outputs.target_repo }} permissions: contents: read - packages: write # For package cache + packages: write # For package cache + actions: write # For cleaning up cache secrets: GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} From d7d861811c15585a4f7ec9d5ae68d2ac28de28a0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:59:17 -0500 Subject: [PATCH 02/48] [ie/tubitv:series] Fix extractor (#10116) Closes #8563 Authored by: bashonly --- yt_dlp/extractor/tubitv.py | 57 +++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py index 9d9ddae720..85eb3a211c 100644 --- a/yt_dlp/extractor/tubitv.py +++ b/yt_dlp/extractor/tubitv.py @@ -13,6 +13,7 @@ class TubiTvIE(InfoExtractor): + IE_NAME = 'tubitv' _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?Pvideo|movies|tv-shows)/(?P\d+)' _LOGIN_URL = 'http://tubitv.com/login' _NETRC_MACHINE = 'tubitv' @@ -148,30 +149,54 @@ def _real_extract(self, url): class TubiTvShowIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P[^/?#]+)' + IE_NAME = 'tubitv:series' + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P[^/?#]+)(?:/season-(?P\d+))?' 
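# A minimal standalone sketch (not part of the patch) of how the season-aware pattern
# above is meant to behave. The `show_name` and `season` group names are an assumption
# taken from the `.group('show_name', 'season')` call in `_real_extract` below; the
# sample URLs come from the test cases.
import re

_SEASON_URL_RE = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?'

for sample in ('https://tubitv.com/series/2311/the-saddle-club/season-3',
               'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true'):
    show_name, season = re.match(_SEASON_URL_RE, sample).group('show_name', 'season')
    playlist_id = f'{show_name}-season-{season}' if season else show_name
    print(playlist_id)  # the-saddle-club-season-3, then the-joy-of-painting-with-bob-ross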
_TESTS = [{ 'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true', - 'playlist_mincount': 390, + 'playlist_mincount': 389, 'info_dict': { 'id': 'the-joy-of-painting-with-bob-ross', }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1', + 'playlist_count': 26, + 'info_dict': { + 'id': 'the-saddle-club-season-1', + }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3', + 'playlist_count': 19, + 'info_dict': { + 'id': 'the-saddle-club-season-3', + }, + }, { + 'url': 'https://tubitv.com/series/2311/the-saddle-club/', + 'playlist_mincount': 71, + 'info_dict': { + 'id': 'the-saddle-club', + }, }] - def _entries(self, show_url, show_name): - show_webpage = self._download_webpage(show_url, show_name) + def _entries(self, show_url, playlist_id, selected_season): + webpage = self._download_webpage(show_url, playlist_id) - show_json = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({[^<]+});\s*', - show_webpage, 'data'), show_name, transform_source=js_to_json)['video'] + data = self._search_json( + r'window\.__data\s*=', webpage, 'data', playlist_id, + transform_source=js_to_json)['video'] - for episode_id in show_json['fullContentById']: - if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's': - continue - yield self.url_result( - f'https://tubitv.com/tv-shows/{episode_id}/', - ie=TubiTvIE.ie_key(), video_id=episode_id) + # v['number'] is already a decimal string, but stringify to protect against API changes + path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., {dict}] + + for season in traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)): + season_number = int_or_none(season.get('number')) + for episode in traverse_obj(season, ('episodes', lambda _, v: v['id'])): + episode_id = episode['id'] + yield self.url_result( + f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id, + season_number=season_number, episode_number=int_or_none(episode.get('num'))) def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(url, show_name), playlist_id=show_name) + playlist_id, selected_season = self._match_valid_url(url).group('show_name', 'season') + if selected_season: + playlist_id = f'{playlist_id}-season-{selected_season}' + return self.playlist_result(self._entries(url, playlist_id, selected_season), playlist_id) From 081708d6074dfbb907e25af61ba530bba0d4b31d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:31:13 -0500 Subject: [PATCH 03/48] [ie/francetv] Fix extractor (#10177) Closes #10175 Authored by: bashonly --- yt_dlp/extractor/francetv.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index f732d56772..de2bec25ac 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -33,6 +33,7 @@ class FranceTVIE(InfoExtractor): _GEO_BYPASS = False _TESTS = [{ + # tokenized url is in dinfo['video']['token'] 'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1', 'info_dict': { 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', @@ -44,6 +45,19 @@ class FranceTVIE(InfoExtractor): 'upload_date': '20170813', }, 'params': {'skip_download': 'm3u8'}, + }, { + # tokenized url is in dinfo['video']['token']['akamai'] + 'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'info_dict': { + 
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'timestamp': 1514118300, + 'duration': 2880, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20171224', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'francetv:162311093', 'only_matching': True, @@ -119,7 +133,7 @@ def _extract_video(self, video_id, hostname=None): video_url = video['url'] format_id = video.get('format') - if token_url := url_or_none(video.get('token')): + if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)): tokenized_url = traverse_obj(self._download_json( token_url, video_id, f'Downloading signed {format_id} manifest URL', fatal=False, query={ @@ -225,13 +239,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', + 'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'timestamp': 1502623500, - 'duration': 2580, + 'timestamp': 1514118300, + 'duration': 2880, 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170813', + 'upload_date': '20171224', }, 'params': { 'skip_download': True, From 3690c2f59827c79a1bbe388a7c1ae75db7477db2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 17:44:20 -0500 Subject: [PATCH 04/48] [ie/francetv] Detect and raise errors for DRM (#10165) Closes #10163 Authored by: bashonly --- yt_dlp/extractor/francetv.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index de2bec25ac..ab08f1c6bf 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -5,6 +5,7 @@ from .dailymotion import DailymotionIE from ..networking import HEADRequest from ..utils import ( + clean_html, determine_ext, filter_dict, format_field, @@ -82,6 +83,7 @@ class FranceTVIE(InfoExtractor): def _extract_video(self, video_id, hostname=None): is_live = None videos = [] + drm_formats = False title = None subtitle = None episode_number = None @@ -99,13 +101,12 @@ def _extract_video(self, video_id, hostname=None): 'device_type': device_type, 'browser': browser, 'domain': hostname, - }), fatal=False) + }), fatal=False, expected_status=422) # 422 json gives detailed error code/message if not dinfo: continue - video = traverse_obj(dinfo, ('video', {dict})) - if video: + if video := traverse_obj(dinfo, ('video', {dict})): videos.append(video) if duration is None: duration = video.get('duration') @@ -113,9 +114,19 @@ def _extract_video(self, video_id, hostname=None): is_live = video.get('is_live') if spritesheets is None: spritesheets = video.get('spritesheets') + elif code := traverse_obj(dinfo, ('code', {int})): + if code == 2009: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif code in (2015, 2017): + # 2015: L'accès à cette vidéo est impossible. 
(DRM-only) + # 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM) + drm_formats = True + continue + self.report_warning( + f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"') + continue - meta = traverse_obj(dinfo, ('meta', {dict})) - if meta: + if meta := traverse_obj(dinfo, ('meta', {dict})): if title is None: title = meta.get('title') # meta['pre_title'] contains season and episode number for series in format "S E" @@ -128,6 +139,9 @@ def _extract_video(self, video_id, hostname=None): if timestamp is None: timestamp = parse_iso8601(meta.get('broadcasted_at')) + if not videos and drm_formats: + self.report_drm(video_id) + formats, subtitles, video_url = [], {}, None for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])): video_url = video['url'] From 92a1c4abaeeba9a69d611c57b73555cb1a1f00ad Mon Sep 17 00:00:00 2001 From: JSubelj Date: Fri, 14 Jun 2024 00:51:12 +0200 Subject: [PATCH 05/48] [ie/rtvslo.si:show] Add extractor (#8418) Authored by: JSubelj, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/rtvslo.py | 160 ++++++++++++++++++-------------- 2 files changed, 96 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e9cd38a651..0f599c9db7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1755,7 +1755,10 @@ RTVETelevisionIE, ) from .rtvs import RTVSIE -from .rtvslo import RTVSLOIE +from .rtvslo import ( + RTVSLOIE, + RTVSLOShowIE, +) from .rudovideo import RudoVideoIE from .rule34video import Rule34VideoIE from .rumble import ( diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py index e71d01d1e0..9c2e6fb6b5 100644 --- a/yt_dlp/extractor/rtvslo.py +++ b/yt_dlp/extractor/rtvslo.py @@ -1,3 +1,5 @@ +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -6,6 +8,7 @@ traverse_obj, unified_timestamp, url_or_none, + urljoin, ) @@ -21,75 +24,73 @@ class RTVSLOIE(InfoExtractor): _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622' SUB_LANGS_MAP = {'Slovenski': 'sl'} - _TESTS = [ - { - 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', - 'info_dict': { - 'id': '174842550', - 'ext': 'mp4', - 'release_timestamp': 1643140032, - 'upload_date': '20220125', - 'series': 'Dnevnik', - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', - 'description': 'md5:76a18692757aeb8f0f51221106277dd2', - 'timestamp': 1643137046, - 'title': 'Dnevnik', - 'series_id': '92', - 'release_date': '20220125', - 'duration': 1789, - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', - 'info_dict': { - 'id': '174843754', - 'ext': 'mp4', - 'series_id': '94', - 'release_date': '20220129', - 'timestamp': 1643484455, - 'title': 'Utrip', - 'duration': 813, - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', - 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', - 'release_timestamp': 1643485825, - 'upload_date': '20220129', - 'series': 'Utrip', - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', - 'info_dict': { - 'id': '174844609', - 'ext': 'mp3', - 'series_id': '106615841', - 'title': 'Il giornale della sera', - 'duration': 1328, - 'series': 'Il giornale della sera', - 'timestamp': 1643743800, - 'release_timestamp': 1643745424, - 'thumbnail': 
'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', - 'upload_date': '20220201', - 'tbr': 128000, - 'release_date': '20220201', - }, - }, { - 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', - 'info_dict': { - 'id': '148350750', - 'ext': 'mp4', - 'title': 'Prvi šolski dan, mozaična oddaja za mlade', - 'series': 'Razred zase', - 'series_id': '148185730', - 'duration': 1481, - 'upload_date': '20121019', - 'timestamp': 1350672122, - 'release_date': '20121019', - 'release_timestamp': 1350672122, - 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', - }, - }, { - 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', - 'only_matching': True, + _TESTS = [{ + 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', + 'info_dict': { + 'id': '174842550', + 'ext': 'mp4', + 'release_timestamp': 1643140032, + 'upload_date': '20220125', + 'series': 'Dnevnik', + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', + 'description': 'md5:76a18692757aeb8f0f51221106277dd2', + 'timestamp': 1643137046, + 'title': 'Dnevnik', + 'series_id': '92', + 'release_date': '20220125', + 'duration': 1789, }, - ] + }, { + 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', + 'info_dict': { + 'id': '174843754', + 'ext': 'mp4', + 'series_id': '94', + 'release_date': '20220129', + 'timestamp': 1643484455, + 'title': 'Utrip', + 'duration': 813, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', + 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', + 'release_timestamp': 1643485825, + 'upload_date': '20220129', + 'series': 'Utrip', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', + 'info_dict': { + 'id': '174844609', + 'ext': 'mp3', + 'series_id': '106615841', + 'title': 'Il giornale della sera', + 'duration': 1328, + 'series': 'Il giornale della sera', + 'timestamp': 1643743800, + 'release_timestamp': 1643745424, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', + 'upload_date': '20220201', + 'tbr': 128000, + 'release_date': '20220201', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', + 'info_dict': { + 'id': '148350750', + 'ext': 'mp4', + 'title': 'Prvi šolski dan, mozaična oddaja za mlade', + 'series': 'Razred zase', + 'series_id': '148185730', + 'duration': 1481, + 'upload_date': '20121019', + 'timestamp': 1350672122, + 'release_date': '20121019', + 'release_timestamp': 1350672122, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', + }, + }, { + 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', + 'only_matching': True, + }] def _real_extract(self, url): v_id = self._match_id(url) @@ -164,3 +165,26 @@ def _real_extract(self, url): 'series': meta.get('showName'), 'series_id': meta.get('showId'), } + + +class RTVSLOShowIE(InfoExtractor): + IE_NAME = 'rtvslo.si:show' + _VALID_URL = r'https?://(?:365|4d)\.rtvslo.si/oddaja/[^/?#&]+/(?P\d+)' + + _TESTS = [{ + 'url': 'https://365.rtvslo.si/oddaja/ekipa-bled/173250997', + 'info_dict': { + 'id': '173250997', + 'title': 'Ekipa Bled', + }, + 'playlist_count': 18, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_from_matches( + re.findall(r']*\bhref="(/arhiv/[^"]+)"', webpage), + playlist_id, 
self._html_extract_title(webpage), + getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE) From e53e56b73543799638fa6abb0c78f8b091aa84e1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 18:01:19 -0500 Subject: [PATCH 06/48] [ie/soundcloud] Fix `download` format extraction (#10125) Authored by: bashonly --- yt_dlp/extractor/soundcloud.py | 52 +++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 0f73684355..0c6f0b070a 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -95,7 +95,7 @@ def _update_client_id(self): return raise ExtractorError('Unable to extract client id') - def _download_json(self, *args, **kwargs): + def _call_api(self, *args, **kwargs): non_fatal = kwargs.get('fatal') is False if non_fatal: del kwargs['fatal'] @@ -104,7 +104,7 @@ def _download_json(self, *args, **kwargs): query['client_id'] = self._CLIENT_ID kwargs['query'] = query try: - return super()._download_json(*args, **kwargs) + return self._download_json(*args, **kwargs) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): self._store_client_id(None) @@ -163,7 +163,7 @@ def genNumBlock(): 'user_agent': self._USER_AGENT } - response = self._download_json( + response = self._call_api( self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), None, note='Verifying login token...', fatal=False, data=json.dumps(payload).encode()) @@ -217,12 +217,26 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f query['secret_token'] = secret_token if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: + try: + # Do not use _call_api(); HTTP Error codes have different meanings for this request + download_data = self._download_json( + f'{self._API_V2_BASE}tracks/{track_id}/download', track_id, + 'Downloading original download format info JSON', query=query, headers=self._HEADERS) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.report_warning( + 'Original download format is only available ' + f'for registered users. 
{self._login_hint()}') + elif isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.write_debug('Original download format is not available for this client') + else: + self.report_warning(e.msg) + download_data = None + + if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})): urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, 'Checking for original download format', fatal=False) + HEADRequest(redirect_url), track_id, 'Checking original download format availability', + 'Original download format is not available', fatal=False) if urlh: format_url = urlh.url format_urls.add(format_url) @@ -303,7 +317,7 @@ def add_format(f, protocol, is_preview=False): stream = None for retry in self.RetryManager(fatal=False): try: - stream = self._download_json( + stream = self._call_api( format_url, track_id, f'Downloading {identifier} format info JSON', query=query, headers=self._HEADERS) except ExtractorError as e: @@ -630,7 +644,7 @@ def _real_extract(self, url): resolve_title += f'/{token}' info_json_url = self._resolv_url(self._BASE_URL + resolve_title) - info = self._download_json( + info = self._call_api( info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) return self._extract_info_dict(info, full_title, token) @@ -641,7 +655,7 @@ def _extract_set(self, playlist, token=None): playlist_id = str(playlist['id']) tracks = playlist.get('tracks') or [] if not all(t.get('permalink_url') for t in tracks) and token: - tracks = self._download_json( + tracks = self._call_api( self._API_V2_BASE + 'tracks', playlist_id, 'Downloading tracks', query={ 'ids': ','.join([str(t['id']) for t in tracks]), @@ -699,7 +713,7 @@ def _real_extract(self, url): if token: full_title += '/' + token - info = self._download_json(self._resolv_url( + info = self._call_api(self._resolv_url( self._BASE_URL + full_title), full_title, headers=self._HEADERS) if 'errors' in info: @@ -730,7 +744,7 @@ def _entries(self, url, playlist_id): for i in itertools.count(): for retry in self.RetryManager(): try: - response = self._download_json( + response = self._call_api( url, playlist_id, query=query, headers=self._HEADERS, note=f'Downloading track page {i + 1}') break @@ -838,7 +852,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) uploader = mobj.group('user') - user = self._download_json( + user = self._call_api( self._resolv_url(self._BASE_URL + uploader), uploader, 'Downloading user info', headers=self._HEADERS) @@ -864,7 +878,7 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): user_id = self._match_id(url) - user = self._download_json( + user = self._call_api( self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS) return self._extract_playlist( @@ -886,7 +900,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): track_name = self._match_id(url) - track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) + track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS) track_id = self._search_regex( r'soundcloud:track-stations:(\d+)', track['id'], 'track id') @@ -930,7 +944,7 @@ class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): slug, relation = self._match_valid_url(url).group('slug', 'relation') - track = self._download_json( + track = self._call_api( self._resolv_url(self._BASE_URL + slug), slug, 'Downloading track info', 
headers=self._HEADERS) @@ -965,7 +979,7 @@ def _real_extract(self, url): if token: query['secret_token'] = token - data = self._download_json( + data = self._call_api( self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS) @@ -1000,7 +1014,7 @@ def _get_collection(self, endpoint, collection_id, **query): next_url = update_url_query(self._API_V2_BASE + endpoint, query) for i in itertools.count(1): - response = self._download_json( + response = self._call_api( next_url, collection_id, f'Downloading page {i}', 'Unable to download API page', headers=self._HEADERS) From b8e2a5e0e1030076f833917906e19bb6c7b318f6 Mon Sep 17 00:00:00 2001 From: garret1317 Date: Fri, 14 Jun 2024 00:08:40 +0100 Subject: [PATCH 07/48] [ie/NHKRadiru] Fix extractor (#10106) Closes #10105 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 240 ++++++++++++++++++++++++++++------------ 1 file changed, 171 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 0ff25a6909..0bd6edfcba 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -4,6 +4,7 @@ from ..utils import ( ExtractorError, clean_html, + filter_dict, get_element_by_class, int_or_none, join_nonempty, @@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor): IE_DESC = 'NHK らじる (Radiru/Rajiru)' _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P[\da-zA-Z]+)_(?P[\da-zA-Z]+)(?:_(?P[\da-zA-Z]+))?' _TESTS = [{ - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210', - 'skip': 'Episode expired on 2024-02-24', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239', + 'skip': 'Episode expired on 2024-06-09', 'info_dict': { - 'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス', - 'id': '0449_01_3926210', + 'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集', + 'id': '0449_01_4003239', 'ext': 'm4a', + 'uploader': 'NHK FM 東京', + 'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc', 'series': 'ジャズ・トゥナイト', - 'uploader': 'NHK-FM', - 'channel': 'NHK-FM', + 'channel': 'NHK FM 東京', 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg', - 'release_date': '20240217', - 'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811', - 'timestamp': 1708185600, - 'release_timestamp': 1708178400, - 'upload_date': '20240217', + 'upload_date': '20240601', + 'series_id': '0449_01', + 'release_date': '20240601', + 'timestamp': 1717257600, + 'release_timestamp': 1717250400, }, }, { # playlist, airs every weekday so it should _hopefully_ be okay forever @@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor): 'id': '0458_01', 'title': 'ベストオブクラシック', 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', - 'channel': 'NHK-FM', - 'uploader': 'NHK-FM', 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', + 'series_id': '0458_01', + 'uploader': 'NHK FM', + 'channel': 'NHK FM', + 'series': 'ベストオブクラシック', }, 'playlist_mincount': 3, }, { # one with letters in the id - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470', - 'note': 'Expires on 2024-03-31', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688', + 'note': 'Expires on 2025-03-31', 'info_dict': { - 'id': 'F300_06_3738470', + 'id': 'F683_01_3910688', 'ext': 'm4a', - 'title': '有島武郎「一房のぶどう」', - 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', - 'channel': 'NHKラジオ第1、NHK-FM', - 'uploader': 'NHKラジオ第1、NHK-FM', - 'timestamp': 1635757200, - 'thumbnail': 
'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', - 'release_date': '20161207', - 'series': 'らじる文庫 by ラジオ深夜便 ', - 'release_timestamp': 1481126700, - 'upload_date': '20211101', + 'title': '夏目漱石「文鳥」第1回', + 'series': '【らじる文庫】夏目漱石「文鳥」(全4回)', + 'series_id': 'F683_01', + 'description': '朗読:浅井理アナウンサー', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg', + 'upload_date': '20240106', + 'release_date': '20240106', + 'uploader': 'NHK R1', + 'release_timestamp': 1704511800, + 'channel': 'NHK R1', + 'timestamp': 1704512700, }, - 'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'], + 'expected_warnings': ['Unable to download JSON metadata', + 'Failed to get extended metadata. API returned Error 1: Invalid parameters'], }, { # news - 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109', - 'skip': 'Expires on 2023-04-17', + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173', 'info_dict': { - 'id': 'F261_01_3855109', + 'id': 'F261_01_4012173', 'ext': 'm4a', 'channel': 'NHKラジオ第1', 'uploader': 'NHKラジオ第1', - 'timestamp': 1681635900, - 'release_date': '20230416', 'series': 'NHKラジオニュース', - 'title': '午後6時のNHKニュース', + 'title': '午前0時のNHKニュース', 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', - 'upload_date': '20230416', - 'release_timestamp': 1681635600, + 'release_timestamp': 1718290800, + 'release_date': '20240613', + 'timestamp': 1718291400, + 'upload_date': '20240613', }, + }, { + # fallback when extended metadata fails + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298', + 'skip': 'Expires on 2024-06-07', + 'info_dict': { + 'id': '2834_01_4009298', + 'title': 'まち☆キラ!開成町特集', + 'ext': 'm4a', + 'release_date': '20240531', + 'upload_date': '20240531', + 'series': 'はま☆キラ!', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg', + 'channel': 'NHK R1,FM', + 'description': '', + 'timestamp': 1717123800, + 'uploader': 'NHK R1,FM', + 'release_timestamp': 1717120800, + 'series_id': '2834_01', + }, + 'expected_warnings': ['Failed to get extended metadata. API returned empty list.'], }] _API_URL_TMPL = None - def _extract_extended_description(self, episode_id, episode): - service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')})) - aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str})) + def _extract_extended_metadata(self, episode_id, aa_vinfo): + service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')})) detail_url = try_call( - lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3)) + lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3])) if not detail_url: - return + return {} - full_meta = traverse_obj( - self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False), - ('list', service, 0, {dict})) or {} - return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta) + response = self._download_json( + detail_url, episode_id, 'Downloading extended metadata', + 'Failed to download extended metadata', fatal=False, expected_status=400) + if not response: + return {} - def _extract_episode_info(self, headline, programme_id, series_meta): + if error := traverse_obj(response, ('error', {dict})): + self.report_warning( + 'Failed to get extended metadata. 
API returned ' + f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}') + return {} + + full_meta = traverse_obj(response, ('list', service, 0, {dict})) + if not full_meta: + self.report_warning('Failed to get extended metadata. API returned empty list.') + return {} + + station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None + thumbnails = [{ + 'id': str(id_), + 'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1, + **traverse_obj(thumb, { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))] + + return filter_dict({ + 'channel': station, + 'uploader': station, + 'description': join_nonempty( + 'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta), + 'thumbnails': thumbnails, + **traverse_obj(full_meta, { + 'title': ('title', {str}), + 'timestamp': ('end_time', {unified_timestamp}), + 'release_timestamp': ('start_time', {unified_timestamp}), + }), + }) + + def _extract_episode_info(self, episode, programme_id, series_meta): + episode_id = f'{programme_id}_{episode["id"]}' + aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')})) + extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo) + fallback_start_time, _, fallback_end_time = traverse_obj( + aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')})) + + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'title': episode.get('program_title'), + 'description': episode.get('program_sub_title'), # fallback + 'timestamp': unified_timestamp(fallback_end_time), + 'release_timestamp': unified_timestamp(fallback_start_time), + **extended_metadata, + } + + def _extract_news_info(self, headline, programme_id, series_meta): episode_id = f'{programme_id}_{headline["headline_id"]}' episode = traverse_obj(headline, ('file_list', 0, {dict})) - description = self._extract_extended_description(episode_id, episode) - if not description: - self.report_warning('Failed to get extended description, falling back to summary') - description = traverse_obj(episode, ('file_title_sub', {str})) return { **series_meta, @@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta): 'was_live': True, 'series': series_meta.get('title'), 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'), - 'description': description, **traverse_obj(episode, { - 'title': 'file_title', + 'title': ('file_title', {str}), + 'description': ('file_title_sub', {str}), 'timestamp': ('open_time', {unified_timestamp}), 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}), }), @@ -706,32 +782,58 @@ def _real_extract(self, url): site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') programme_id = f'{site_id}_{corner_id}' - if site_id == 'F261': - json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json' - else: - json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json' + if site_id == 'F261': # XXX: News programmes use old API (for now?) 
+ meta = self._download_json( + 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main'] + series_meta = traverse_obj(meta, { + 'title': ('program_name', {str}), + 'channel': ('media_name', {str}), + 'uploader': ('media_name', {str}), + 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), + }, get_all=False) - meta = self._download_json(json_url, programme_id)['main'] + if headline_id: + headline = traverse_obj( + meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any)) + if not headline: + raise ExtractorError('Content not found; it has most likely expired', expected=True) + return self._extract_news_info(headline, programme_id, series_meta) - series_meta = traverse_obj(meta, { - 'title': 'program_name', - 'channel': 'media_name', - 'uploader': 'media_name', - 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), - }, get_all=False) + def news_entries(): + for headline in traverse_obj(meta, ('detail_list', ..., {dict})): + yield self._extract_news_info(headline, programme_id, series_meta) + + return self.playlist_result( + news_entries(), programme_id, description=meta.get('site_detail'), **series_meta) + + meta = self._download_json( + 'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={ + 'site_id': site_id, + 'corner_site_id': corner_id, + }) + + fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ') + series_meta = { + 'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta), + 'series_id': programme_id, + 'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})), + 'channel': fallback_station, + 'uploader': fallback_station, + } if headline_id: - return self._extract_episode_info( - traverse_obj(meta, ( - 'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False), - programme_id, series_meta) + episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any)) + if not episode: + raise ExtractorError('Content not found; it has most likely expired', expected=True) + return self._extract_episode_info(episode, programme_id, series_meta) def entries(): - for headline in traverse_obj(meta, ('detail_list', ..., {dict})): - yield self._extract_episode_info(headline, programme_id, series_meta) + for episode in traverse_obj(meta, ('episodes', ..., {dict})): + yield self._extract_episode_info(episode, programme_id, series_meta) return self.playlist_result( - entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta) + entries(), programme_id, title=series_meta.get('series'), + description=meta.get('series_description'), **series_meta) class NhkRadioNewsPageIE(InfoExtractor): From ea88129784fcbb6987161df9ba05909325d8e2e9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 18:16:43 -0500 Subject: [PATCH 08/48] [ie/tiktok] Detect and raise when login is required (#10124) Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index dc74d4a1f5..48934fc6b3 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -213,8 +213,19 @@ def _extract_aweme_app(self, aweme_id): return self._parse_aweme_video_app(aweme_detail) def _extract_web_data_and_status(self, url, video_id, fatal=True): - webpage = self._download_webpage(url, video_id, headers={'User-Agent': 
'Mozilla/5.0'}, fatal=fatal) or '' - video_data, status = {}, None + video_data, status = {}, -1 + + res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'}) + if res is False: + return video_data, status + + webpage, urlh = res + if urllib.parse.urlparse(urlh.url).path == '/login': + message = 'TikTok is requiring login for access to this content' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. {self._login_hint()}') + return video_data, status if universal_data := self._get_universal_data(webpage, video_id): self.write_debug('Found universal data for rehydration') From a0d9967f6822fc279e86bce33464194985148727 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 13 Jun 2024 18:22:30 -0500 Subject: [PATCH 09/48] [ie/youtube:tab] Fix channel metadata extraction (#10071) Closes #9893, Closes #10090 Authored by: bashonly, shoxie007 Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com> --- yt_dlp/extractor/youtube.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a227f24258..a89744eb10 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -885,14 +885,14 @@ def _get_count(self, data, *path_list): return count @staticmethod - def _extract_thumbnails(data, *path_list): + def _extract_thumbnails(data, *path_list, final_key='thumbnails'): """ Extract thumbnails from thumbnails dict @param path_list: path list to level that contains 'thumbnails' key """ thumbnails = [] for path in path_list or [()]: - for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)): + for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -5124,6 +5124,10 @@ def _extract_metadata_from_tabs(self, item_id, data): else: metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict) + # pageHeaderViewModel slow rollout began April 2024 + page_header_view_model = traverse_obj(data, ( + 'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict})) + # We can get the uncropped banner/avatar by replacing the crop params with '=s0' # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 def _get_uncropped(url): @@ -5139,8 +5143,10 @@ def _get_uncropped(url): 'preference': 1, }) - channel_banners = self._extract_thumbnails( - data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) + channel_banners = ( + self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) + or self._extract_thumbnails( + page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources')) for banner in channel_banners: banner['preference'] = -10 @@ -5167,7 +5173,11 @@ def _get_uncropped(url): or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or info['id']), 'availability': self._extract_availability(data), - 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), + 'channel_follower_count': ( + self._get_count(data, ('header', ..., 'subscriberCountText')) + or traverse_obj(page_header_view_model, ( + 'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts', + lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))), 'description': 
try_get(metadata_renderer, lambda x: x.get('description', '')), 'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str})) or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))), From 4093eb1fcc29a0e2aea9adfcba479787d9ae0c0c Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 15 Jun 2024 15:51:27 -0400 Subject: [PATCH 10/48] [ie/khanacademy] Fix extractors (#9136) Closes #8775 Authored by: c-basalt --- yt_dlp/extractor/khanacademy.py | 141 +++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 49 deletions(-) diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 5333036a8b..3f03f9e4c4 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -3,43 +3,52 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + make_archive_id, parse_iso8601, - try_get, + str_or_none, + traverse_obj, + url_or_none, + urljoin, ) class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' + _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + def _parse_video(self, video): return { '_type': 'url_transparent', 'url': video['youtubeId'], - 'id': video.get('slug'), - 'title': video.get('title'), - 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), - 'duration': int_or_none(video.get('duration')), - 'description': video.get('description'), + 'id': video['youtubeId'], 'ie_key': 'Youtube', + **traverse_obj(video, { + 'display_id': ('id', {str_or_none}), + 'title': ('translatedTitle', {str}), + 'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + }, get_all=False), } def _real_extract(self, url): display_id = self._match_id(url) content = self._download_json( - 'https://www.khanacademy.org/api/internal/graphql/FetchContentData', - display_id, query={ + 'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id, + query={ 'fastly_cacheable': 'persist_until_publish', - 'hash': '4134764944', - 'lang': 'en', + 'pcv': self._PUBLISHED_CONTENT_VERSION, + 'hash': '1242644265', 'variables': json.dumps({ 'path': display_id, - 'queryParams': 'lang=en', - 'isModal': False, - 'followRedirects': True, 'countryCode': 'US', + 'kaLocale': 'en', + 'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION, }), - })['data']['contentJson'] - return self._parse_component_props(self._parse_json(content, display_id)['componentProps']) + 'lang': 'en', + })['data']['contentRoute']['listedPathData'] + return self._parse_component_props(content, display_id) class KhanAcademyIE(KhanAcademyBaseIE): @@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE): _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') _TEST = { 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', - 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', + 'md5': '1d5c2e70fa6aa29c38eca419f12515ce', 'info_dict': { 'id': 'FlIG3TvQCBQ', 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', + 'display_id': '716378217', 'duration': 176, - 'uploader': 'Brit Cruise', - 'uploader_id': 'khanacademy', + 'uploader': 'Khan Academy', + 'uploader_id': '@khanacademy', + 'uploader_url': 'https://www.youtube.com/@khanacademy', 'upload_date': '20120411', 'timestamp': 1334170113, 'license': 'cc-by-nc-sa', + 
'live_status': 'not_live', + 'channel': 'Khan Academy', + 'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_is_verified': True, + 'playable_in_embed': True, + 'categories': ['Education'], + 'creators': ['Brit Cruise'], + 'tags': [], + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': str, + 'view_count': int, + 'like_count': int, + 'heatmap': list, }, 'add_ie': ['Youtube'], } - def _parse_component_props(self, component_props): - video = component_props['tutorialPageData']['contentModel'] - info = self._parse_video(video) - author_names = video.get('authorNames') - info.update({ - 'uploader': ', '.join(author_names) if author_names else None, - 'timestamp': parse_iso8601(video.get('dateAdded')), - 'license': video.get('kaUserLicense'), - }) - return info + def _parse_component_props(self, component_props, display_id): + video = component_props['content'] + return { + **self._parse_video(video), + **traverse_obj(video, { + 'creators': ('authorNames', ..., {str}), + 'timestamp': ('dateAdded', {parse_iso8601}), + 'license': ('kaUserLicense', {str}), + }), + } class KhanAcademyUnitIE(KhanAcademyBaseIE): IE_NAME = 'khanacademy:unit' - _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' - _TEST = { + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)' + _TESTS = [{ 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { - 'id': 'cryptography', + 'id': 'x48c910b6', 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? What has changed today?', + 'display_id': 'computing/computer-science/cryptography', + '_old_archive_ids': ['khanacademyunit cryptography'], }, 'playlist_mincount': 31, - } + }, { + 'url': 'https://www.khanacademy.org/computing/computer-science', + 'info_dict': { + 'id': 'x301707a0', + 'title': 'Computer science theory', + 'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba', + 'display_id': 'computing/computer-science', + '_old_archive_ids': ['khanacademyunit computer-science'], + }, + 'playlist_mincount': 50, + }] - def _parse_component_props(self, component_props): - curation = component_props['curation'] + def _parse_component_props(self, component_props, display_id): + course = component_props['course'] + selected_unit = traverse_obj(course, ( + 'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course - entries = [] - tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] - for tutorial_number, tutorial in enumerate(tutorials, 1): - chapter_info = { - 'chapter': tutorial.get('title'), - 'chapter_number': tutorial_number, - 'chapter_id': tutorial.get('id'), - } - for content_item in (tutorial.get('contentItems') or []): - if content_item.get('kind') == 'Video': - info = self._parse_video(content_item) - info.update(chapter_info) - entries.append(info) + def build_entry(entry): + return self.url_result(urljoin( + 'https://www.khanacademy.org', entry['canonicalUrl']), + KhanAcademyIE, title=entry.get('translatedTitle')) + + entries = traverse_obj(selected_unit, ( + (('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren', + lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry})) return self.playlist_result( - entries, curation.get('unit'), curation.get('title'), - 
curation.get('description')) + entries, + display_id=display_id, + **traverse_obj(selected_unit, { + 'id': ('id', {str}), + 'title': ('translatedTitle', {str}), + 'description': ('translatedDescription', {str}), + '_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}), + })) From ca8885edd93bdf8912af6c22ee335b6222cb9ba9 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 3 Jun 2024 11:22:49 -0500 Subject: [PATCH 11/48] [fd/hls] Apply `extra_param_to_key_url` from info dict Authored by: bashonly --- yt_dlp/YoutubeDL.py | 5 +++-- yt_dlp/downloader/external.py | 2 +- yt_dlp/downloader/hls.py | 24 ++++++++++++++---------- yt_dlp/extractor/common.py | 9 ++++++++- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 5abcb4635c..7ed01bf840 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -581,8 +581,9 @@ class YoutubeDL: 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies', - 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options', - 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time', + 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url', + 'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', + 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time', } _deprecated_multivalue_fields = { 'album_artist': 'album_artists', diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 8b45c671a0..63c1085699 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -108,7 +108,7 @@ def supports(cls, info_dict): return all(( not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, - not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'), + not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'), all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), )) diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 9cb4f014c0..0a00d5dabb 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -160,10 +160,12 @@ def is_ad_fragment_end(s): extra_state = ctx.setdefault('extra_state', {}) format_index = info_dict.get('format_index') - extra_query = None - extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') - if extra_param_to_segment_url: - extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + extra_segment_query = None + if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'): + extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url) + extra_key_query = None + if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'): + extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -190,8 +192,8 @@ def is_ad_fragment_end(s): if frag_index <= ctx['fragment_index']: continue frag_url = urljoin(man_url, line) - if extra_query: - 
frag_url = update_url_query(frag_url, extra_query) + if extra_segment_query: + frag_url = update_url_query(frag_url, extra_segment_query) fragments.append({ 'frag_index': frag_index, @@ -212,8 +214,8 @@ def is_ad_fragment_end(s): frag_index += 1 map_info = parse_m3u8_attributes(line[11:]) frag_url = urljoin(man_url, map_info.get('URI')) - if extra_query: - frag_url = update_url_query(frag_url, extra_query) + if extra_segment_query: + frag_url = update_url_query(frag_url, extra_segment_query) if map_info.get('BYTERANGE'): splitted_byte_range = map_info.get('BYTERANGE').split('@') @@ -244,8 +246,10 @@ def is_ad_fragment_end(s): decrypt_info['KEY'] = external_aes_key else: decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI']) - if extra_query: - decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) + if extra_key_query or extra_segment_query: + # Fall back to extra_segment_query to key for backwards compat + decrypt_info['URI'] = update_url_query( + decrypt_info['URI'], extra_key_query or extra_segment_query) if decrypt_url != decrypt_info['URI']: decrypt_info['KEY'] = None diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 2799747ece..e5efd08b4f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -234,7 +234,14 @@ class InfoExtractor: 'maybe' if the format may have DRM and has to be tested before download. * extra_param_to_segment_url A query string to append to each fragment's URL, or to update each existing query string - with. Only applied by the native HLS/DASH downloaders. + with. If it is an HLS stream with an AES-128 decryption key, + the query paramaters will be passed to the key URI as well, + unless there is an `extra_param_to_key_url` given, + or unless an external key URI is provided via `hls_aes`. + Only applied by the native HLS/DASH downloaders. + * extra_param_to_key_url A query string to append to the URL + of the format's HLS AES-128 decryption key. + Only applied by the native HLS downloader. * hls_aes A dictionary of HLS AES-128 decryption information used by the native HLS downloader to override the values in the media playlist when an '#EXT-X-KEY' tag From 5dbac313ae4e3e8521dfe2e1a6a048a98ff4b4fe Mon Sep 17 00:00:00 2001 From: bashonly Date: Sat, 15 Jun 2024 18:18:42 -0500 Subject: [PATCH 12/48] [ie/generic] Add `key_query` extractor-arg Authored by: bashonly --- README.md | 3 ++- yt_dlp/extractor/generic.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 42ffd9b520..ea7c671748 100644 --- a/README.md +++ b/README.md @@ -1779,8 +1779,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off #### generic -* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Note that if the stream has an HLS AES-128 key, then the query parameters will be passed to the key URI as well, unless the `key_query` extractor-arg is passed, or unless an external key URI is provided via the `hls_key` extractor-arg. 
Does not apply to ffmpeg * `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE` +* `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index cc17890e76..3b8e1e957c 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2167,7 +2167,15 @@ def _extra_manifest_info(self, info, manifest_url): urllib.parse.urlparse(fragment_query).query or fragment_query or urllib.parse.urlparse(manifest_url).query or None) - hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + key_query = self._configuration_arg('key_query', [None], casesense=True)[0] + if key_query is not None: + info['extra_param_to_key_url'] = ( + urllib.parse.urlparse(key_query).query or key_query + or urllib.parse.urlparse(manifest_url).query or None) + + def hex_or_none(value): + return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), }) or None From d6c2c2bc84f1434255be5c73baeb17d893d2c0d4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 16 Jun 2024 19:01:46 -0500 Subject: [PATCH 13/48] [ie/sproutvideo] Add extractors (#10098) Closes #2933, Closes #8942 Authored by: bashonly, TheZ3ro Co-authored-by: thezero --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/patreon.py | 17 ++- yt_dlp/extractor/sproutvideo.py | 198 ++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 yt_dlp/extractor/sproutvideo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0f599c9db7..c411efb5aa 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1928,6 +1928,10 @@ ) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE +from .sproutvideo import ( + SproutVideoIE, + VidsIoIE, +) from .srgssr import ( SRGSSRIE, SRGSSRPlayIE, diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 26ca84ab34..5dc46e3171 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -2,6 +2,7 @@ import urllib.parse from .common import InfoExtractor +from .sproutvideo import VidsIoIE from .vimeo import VimeoIE from ..networking.exceptions import HTTPError from ..utils import ( @@ -12,6 +13,7 @@ int_or_none, mimetype2ext, parse_iso8601, + smuggle_url, str_or_none, traverse_obj, url_or_none, @@ -305,22 +307,27 @@ def _real_extract(self, url): 'channel_follower_count': ('attributes', 'patron_count', 
{int_or_none}), })) + # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo + headers = {'referer': 'https://patreon.com/'} + # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': v_url = urllib.parse.unquote(self._html_search_regex( r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '') if url_or_none(v_url) and self._request_webpage( - v_url, video_id, 'Checking Vimeo embed URL', - headers={'Referer': 'https://patreon.com/'}, - fatal=False, errnote=False): + v_url, video_id, 'Checking Vimeo embed URL', headers=headers, fatal=False, errnote=False): entries.append(self.url_result( VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) - if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): - entries.append(self.url_result(embed_url)) + if embed_url and (urlh := self._request_webpage( + embed_url, video_id, 'Checking embed URL', headers=headers, + fatal=False, errnote=False, expected_status=403)): + # Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie + if urlh.status != 403 or VidsIoIE.suitable(embed_url): + entries.append(self.url_result(smuggle_url(embed_url, headers))) post_file = traverse_obj(attributes, ('post_file', {dict})) if post_file: diff --git a/yt_dlp/extractor/sproutvideo.py b/yt_dlp/extractor/sproutvideo.py new file mode 100644 index 0000000000..c0923594e5 --- /dev/null +++ b/yt_dlp/extractor/sproutvideo.py @@ -0,0 +1,198 @@ +import base64 +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + remove_start, + smuggle_url, + unsmuggle_url, + update_url_query, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class SproutVideoIE(InfoExtractor): + _NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P[\da-f]+)/[\da-f]+' + _VALID_URL = rf'https?:{_NO_SCHEME_RE}' + _EMBED_REGEX = [rf'
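# A rough sketch (not taken from the patch) of the smuggling pattern the Patreon change
# relies on: extra data such as the referer header is tucked into the embed URL with
# `smuggle_url` and recovered by the embed extractor with `unsmuggle_url`. The embed URL
# below is a made-up placeholder.
from yt_dlp.utils import smuggle_url, unsmuggle_url

headers = {'referer': 'https://patreon.com/'}
smuggled = smuggle_url('https://videos.sproutvideo.com/embed/0123456789abcdef/fedcba9876543210', headers)
# ...later, inside the receiving extractor:
url, smuggled_data = unsmuggle_url(smuggled, default={})
print(url)                            # original embed URL
print(smuggled_data.get('referer'))   # 'https://patreon.com/'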