From 8e6e3651727b0b85764857fc6329fe5e0a3f00de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Finn=20R=2E=20G=C3=A4rtner?= <65015656+FinnRG@users.noreply.github.com> Date: Sun, 14 Jan 2024 19:28:03 +0100 Subject: [PATCH 01/61] [ie/Piapro] Improve `_VALID_URL` (#8999) Authored by: FinnRG --- yt_dlp/extractor/piapro.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py index 5f39e06396..3ae985da2b 100644 --- a/yt_dlp/extractor/piapro.py +++ b/yt_dlp/extractor/piapro.py @@ -12,7 +12,7 @@ class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P[\w-]+)/?' _TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', 'md5': 'f7c0f760913fb1d44a1c45a4af793909', @@ -49,6 +49,9 @@ class PiaproIE(InfoExtractor): }, { 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', 'only_matching': True + }, { + 'url': 'https://piapro.jp/t/-SO-', + 'only_matching': True }] _login_status = False From 014cb5774d7afe624b6eb4e07f7be924b9e5e186 Mon Sep 17 00:00:00 2001 From: Andrew Gibson Date: Thu, 18 Jan 2024 16:18:04 -0500 Subject: [PATCH 02/61] [ie/aenetworks] Rating should be optional for AP extraction (#9005) Authored by: agibson-fl --- yt_dlp/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index 63a0532ef1..ab4b6c0ebc 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -93,7 +93,7 @@ def _extract_aetn_info(self, domain, filter_key, filter_value, url): resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) + traverse_obj(theplatform_metadata, ('ratings', 0, 'rating'))) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) info.update(self._extract_aen_smil(media_url, video_id, auth)) From 4d9dc0abe24ad5d9d22a16f40fc61137dcd103f7 Mon Sep 17 00:00:00 2001 From: Bibhav48 <76898850+Bibhav48@users.noreply.github.com> Date: Fri, 19 Jan 2024 03:05:04 +0545 Subject: [PATCH 03/61] [ie/cloudflarestream] Extract subtitles (#9007) Closes #8830 Authored by: Bibhav48 --- yt_dlp/extractor/cloudflarestream.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 748e8e9087..c4c7d66a5a 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -46,15 +46,18 @@ def _real_extract(self, url): video_id.split('.')[1] + '==='), video_id)['sub'] manifest_base_url = base_url + 'manifest/video.' - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( manifest_base_url + 'm3u8', video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( - manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, 'title': video_id, 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', 'formats': formats, + 'subtitles': subtitles, } From 393b487a4ea391c44e811505ec98531031d7e81e Mon Sep 17 00:00:00 2001 From: Nicolas Appriou Date: Fri, 19 Jan 2024 00:23:29 +0100 Subject: [PATCH 04/61] [ie/ArteTV] Separate closed captions (#8231) Authored by: Nicals, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/arte.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 139a3a729f..92b4900f96 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -70,7 +70,24 @@ class ArteTVIE(ArteTVBaseIE): 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530', 'upload_date': '20230930', 'ext': 'mp4', - } + }, + }, { + 'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/', + 'info_dict': { + 'id': '085374-003-A', + 'ext': 'mp4', + 'description': 'md5:ab79ec7cc472a93164415b4e4916abf9', + 'timestamp': 1702872000, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530', + 'duration': 2594, + 'title': 'Die kurze Zeit der Jugend', + 'alt_title': 'Im hohen Norden geboren', + 'upload_date': '20231218', + 'subtitles': { + 'fr': 'mincount:1', + 'fr-acc': 'mincount:1', + }, + }, }] _GEO_BYPASS = True @@ -121,6 +138,16 @@ class ArteTVIE(ArteTVBaseIE): ), } + @staticmethod + def _fix_accessible_subs_locale(subs): + updated_subs = {} + for lang, sub_formats in subs.items(): + for format in sub_formats: + if format.get('url', '').endswith('-MAL.m3u8'): + lang += '-acc' + updated_subs.setdefault(lang, []).append(format) + return updated_subs + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -174,6 +201,7 @@ def _real_extract(self, url): secondary_formats.extend(fmts) else: formats.extend(fmts) + subs = self._fix_accessible_subs_locale(subs) self._merge_subtitles(subs, target=subtitles) elif stream['protocol'] in ('HTTPS', 'RTMP'): From 5498729c59b03a9511c64552da3ba2f802166f8d Mon Sep 17 00:00:00 2001 From: jazz1611 Date: Fri, 19 Jan 2024 06:24:34 +0700 Subject: [PATCH 05/61] [ie/GoogleDrive] Fix source file extraction (#8990) Closes #8976 Authored by: jazz1611 --- yt_dlp/extractor/googledrive.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 2fdec20f66..06658dd479 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -19,9 +19,9 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:docs|drive)\.google\.com/ + (?:docs|drive|drive\.usercontent)\.google\.com/ (?: - (?:uc|open)\?.*?id=| + (?:uc|open|download)\?.*?id=| file/d/ )| video\.google\.com/get_player\?.*?docid= @@ -53,6 +53,9 @@ class GoogleDriveIE(InfoExtractor): }, { 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 'only_matching': True, + }, { + 'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', + 'only_matching': True, }] _FORMATS_EXT = { '5': 'flv', @@ -205,9 +208,10 @@ def get_value(key): formats.append(f) source_url = update_url_query( - 'https://drive.google.com/uc', { + 'https://drive.usercontent.google.com/download', { 'id': video_id, 'export': 'download', + 'confirm': 't', }) def request_source_file(source_url, kind, data=None): From cf6413e840476c15e5b166dc2f7cc2a90a4a9aad Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 19 Jan 2024 08:27:25 +0900 Subject: [PATCH 06/61] [ie/BiliIntl] Fix and improve subtitles extraction (#7077) Closes #7075, Closes #6664 Authored by: HobbyistDev, itachi-19, dirkf, seproDev Co-authored-by: itachi-19 <16500619+itachi-19@users.noreply.github.com> Co-authored-by: dirkf Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/bilibili.py | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index bc25dc75e2..5475b3650b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -18,6 +18,7 @@ OnDemandPagedList, bool_or_none, clean_html, + determine_ext, filter_dict, float_or_none, format_field, @@ -1658,19 +1659,34 @@ def _get_subtitles(self, *, ep_id=None, aid=None): 'aid': aid, })) or {} subtitles = {} - for sub in sub_json.get('subtitles') or []: - sub_url = sub.get('url') - if not sub_url: - continue - sub_data = self._download_json( - sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, - note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') - if not sub_data: - continue - subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ - 'ext': 'srt', - 'data': self.json2srt(sub_data) - }) + fetched_urls = set() + for sub in traverse_obj(sub_json, (('subtitles', 'video_subtitle'), ..., {dict})): + for url in traverse_obj(sub, ((None, 'ass', 'srt'), 'url', {url_or_none})): + if url in fetched_urls: + continue + fetched_urls.add(url) + sub_ext = determine_ext(url) + sub_lang = sub.get('lang_key') or 'en' + + if sub_ext == 'ass': + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'ass', + 'url': url, + }) + elif sub_ext == 'json': + sub_data = self._download_json( + url, ep_id or aid, fatal=False, + note=f'Downloading subtitles{format_field(sub, "lang", " for %s")} ({sub_lang})', + errnote='Unable to download subtitles') + + if sub_data: + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'srt', + 'data': self.json2srt(sub_data), + }) + else: + self.report_warning('Unexpected subtitle extension', ep_id or aid) + return subtitles def _get_formats(self, *, ep_id=None, aid=None): From cf9af2c7f1fedd881a157b3fbe725e5494b00924 Mon Sep 17 00:00:00 2001 From: Akmal <72781956+Wikidepia@users.noreply.github.com> Date: Fri, 19 Jan 2024 06:40:08 +0700 Subject: [PATCH 07/61] [ie/Facebook] Add new ID format (#3824) Closes #3496 Authored by: Wikidepia, kclauhk Co-authored-by: kclauhk <78251477+kclauhk@users.noreply.github.com> --- yt_dlp/extractor/facebook.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a07a0d344d..a16a067abb 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor): )| facebook: ) - (?P[0-9]+) + (?Ppfbid[A-Za-z0-9]+|\d+) ''' _EMBED_REGEX = [ r']+?src=(["\'])(?Phttps?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', @@ -247,6 +247,24 @@ class FacebookIE(InfoExtractor): 'thumbnail': r're:^https?://.*', 'duration': 148.435, }, + }, { + 'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl', + 'info_dict': { + 'id': '6968553779868435', + 'ext': 'mp4', + 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', + 'uploader': 'ATTN:', + 'upload_date': '20231207', + 'title': 'ATTN:', + 'duration': 132.675, + 'uploader_id': '100064451419378', + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'timestamp': 1701975646, + }, + }, { + 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', + 'only_matching': True, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, From fee2d8d9c38f9b5f0a8df347c1e698983339c34d Mon Sep 17 00:00:00 2001 From: gmes78 Date: Thu, 18 Jan 2024 23:41:28 +0000 Subject: [PATCH 08/61] [ie/Rule34Video] Extract more metadata (#7416) Closes #7233 Authored by: gmes78 --- yt_dlp/extractor/rule34video.py | 77 +++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index f3250b557a..e6bb4258e9 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -1,7 +1,20 @@ import re -from ..utils import parse_duration, unescapeHTML from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_attribute, + get_element_by_class, + get_element_html_by_class, + get_elements_by_class, + int_or_none, + join_nonempty, + parse_count, + parse_duration, + unescapeHTML, +) +from ..utils.traversal import traverse_obj class Rule34VideoIE(InfoExtractor): @@ -17,7 +30,16 @@ class Rule34VideoIE(InfoExtractor): 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg', 'duration': 347.0, 'age_limit': 18, - 'tags': 'count:14' + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'timestamp': 1639872000, + 'description': 'https://discord.gg/aBqPrHSHvv', + 'upload_date': '20211219', + 'uploader': 'Sweet HMV', + 'uploader_url': 'https://rule34video.com/members/22119/', + 'categories': ['3D', 'MMD', 'iwara'], + 'tags': 'mincount:10' } }, { @@ -30,7 +52,17 @@ class Rule34VideoIE(InfoExtractor): 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg', 'duration': 938.0, 'age_limit': 18, - 'tags': 'count:50' + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'timestamp': 1640131200, + 'description': '', + 'creator': 'WildeerStudio', + 'upload_date': '20211222', + 'uploader': 'CerZule', + 'uploader_url': 'https://rule34video.com/members/36281/', + 'categories': ['3D', 'Tomb Raider'], + 'tags': 'mincount:40' } }, ] @@ -49,17 +81,44 @@ def _real_extract(self, url): 'quality': quality, }) - title = self._html_extract_title(webpage) - thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) - duration = self._html_search_regex(r'"icon-clock">\s+((?:\d+:?)+)', webpage, 'duration', default=None) + categories, creator, uploader, uploader_url = [None] * 4 + for col in get_elements_by_class('col', webpage): + label = clean_html(get_element_by_class('label', col)) + if label == 'Categories:': + categories = list(map(clean_html, get_elements_by_class('item', col))) + elif label == 'Artist:': + creator = join_nonempty(*map(clean_html, get_elements_by_class('item', col)), delim=', ') + elif label == 'Uploaded By:': + uploader = clean_html(get_element_by_class('name', col)) + uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href') return { + **traverse_obj(self._search_json_ld(webpage, video_id, default={}), ({ + 'title': 'title', + 'view_count': 'view_count', + 'like_count': 'like_count', + 'duration': 'duration', + 'timestamp': 'timestamp', + 'description': 'description', + 'thumbnail': ('thumbnails', 0, 'url'), + })), 'id': video_id, 'formats': formats, - 'title': title, - 'thumbnail': thumbnail, - 'duration': parse_duration(duration), + 'title': self._html_extract_title(webpage), + 'thumbnail': self._html_search_regex( + r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None), + 'duration': parse_duration(self._html_search_regex( + r'"icon-clock">\s+((?:\d+:?)+)', webpage, 'duration', default=None)), + 'view_count': int_or_none(self._html_search_regex( + r'"icon-eye">\s+([ \d]+)', webpage, 'views', default='').replace(' ', '')), + 'like_count': parse_count(get_element_by_class('voters count', webpage)), + 'comment_count': int_or_none(self._search_regex( + r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)), 'age_limit': 18, + 'creator': creator, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'categories': categories, 'tags': list(map(unescapeHTML, re.findall( r']+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P[^>]*)', webpage))), } From 5e2e24b2c5795756d81785b06b10723ddb6db7b2 Mon Sep 17 00:00:00 2001 From: Philipp Waldhauer Date: Fri, 19 Jan 2024 00:52:13 +0100 Subject: [PATCH 09/61] [ie/MagentaMusik] Add extractor (#7790) Authored by: pwaldhauer, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/magentamusik.py | 62 +++++++++++++++++++++++++++++ yt_dlp/extractor/magentamusik360.py | 58 --------------------------- 3 files changed, 63 insertions(+), 59 deletions(-) create mode 100644 yt_dlp/extractor/magentamusik.py delete mode 100644 yt_dlp/extractor/magentamusik360.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 557ff94470..b49e0366c0 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -996,7 +996,7 @@ ) from .maariv import MaarivIE from .magellantv import MagellanTVIE -from .magentamusik360 import MagentaMusik360IE +from .magentamusik import MagentaMusikIE from .mailru import ( MailRuIE, MailRuMusicIE, diff --git a/yt_dlp/extractor/magentamusik.py b/yt_dlp/extractor/magentamusik.py new file mode 100644 index 0000000000..9d86a1b21d --- /dev/null +++ b/yt_dlp/extractor/magentamusik.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, join_nonempty, url_or_none +from ..utils.traversal import traverse_obj + + +class MagentaMusikIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?magentamusik\.de/(?P[^/?#]+)' + + _TESTS = [{ + 'url': 'https://www.magentamusik.de/marty-friedman-woa-2023-9208205928595409235', + 'md5': 'd82dd4748f55fc91957094546aaf8584', + 'info_dict': { + 'id': '9208205928595409235', + 'display_id': 'marty-friedman-woa-2023-9208205928595409235', + 'ext': 'mp4', + 'title': 'Marty Friedman: W:O:A 2023', + 'alt_title': 'Konzert vom: 05.08.2023 13:00', + 'duration': 2760, + 'categories': ['Musikkonzert'], + 'release_year': 2023, + 'location': 'Deutschland', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_config = self._search_json( + r'data-js-element="o-video-player__config">', webpage, 'player config', display_id, fatal=False) + if not player_config: + raise ExtractorError('No video found', expected=True) + + asset_id = player_config['assetId'] + asset_details = self._download_json( + f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/assetdetails/58938/{asset_id}', + display_id, note='Downloading asset details') + + video_id = traverse_obj( + asset_details, ('content', 'partnerInformation', ..., 'reference', {str}), get_all=False) + if not video_id: + raise ExtractorError('Unable to extract video id') + + vod_data = self._download_json( + f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/player/58935/{video_id}/Main%20Movie', video_id) + smil_url = traverse_obj( + vod_data, ('content', 'feature', 'representations', ..., + 'contentPackages', ..., 'media', 'href', {url_or_none}), get_all=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': self._extract_smil_formats(smil_url, video_id), + **traverse_obj(vod_data, ('content', 'feature', 'metadata', { + 'title': 'title', + 'alt_title': 'originalTitle', + 'description': 'longDescription', + 'duration': ('runtimeInSeconds', {int_or_none}), + 'location': ('countriesOfProduction', {list}, {lambda x: join_nonempty(*x, delim=', ')}), + 'release_year': ('yearOfProduction', {int_or_none}), + 'categories': ('mainGenre', {str}, {lambda x: x and [x]}), + })), + } diff --git a/yt_dlp/extractor/magentamusik360.py b/yt_dlp/extractor/magentamusik360.py deleted file mode 100644 index 5d0cb3bfb5..0000000000 --- a/yt_dlp/extractor/magentamusik360.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor - - -class MagentaMusik360IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?magenta-musik-360\.de/([a-z0-9-]+-(?P[0-9]+)|festivals/.+)' - _TESTS = [{ - 'url': 'https://www.magenta-musik-360.de/within-temptation-wacken-2019-1-9208205928595185932', - 'md5': '65b6f060b40d90276ec6fb9b992c1216', - 'info_dict': { - 'id': '9208205928595185932', - 'ext': 'm3u8', - 'title': 'WITHIN TEMPTATION', - 'description': 'Robert Westerholt und Sharon Janny den Adel gründeten die Symphonic Metal-Band. Privat sind die Niederländer ein Paar und haben zwei Kinder. Die Single Ice Queen brachte ihnen Platin und Gold und verhalf 2002 zum internationalen Durchbruch. Charakteristisch für die Band war Anfangs der hohe Gesang von Frontfrau Sharon. Stilistisch fing die Band im Gothic Metal an. Mit neuem Sound, schnellen Gitarrenriffs und Gitarrensoli, avancierte Within Temptation zur erfolgreichen Rockband. Auch dieses Jahr wird die Band ihre Fangemeinde wieder mitreißen.', - } - }, { - 'url': 'https://www.magenta-musik-360.de/festivals/wacken-world-wide-2020-body-count-feat-ice-t', - 'md5': '81010d27d7cab3f7da0b0f681b983b7e', - 'info_dict': { - 'id': '9208205928595231363', - 'ext': 'm3u8', - 'title': 'Body Count feat. Ice-T', - 'description': 'Body Count feat. Ice-T konnten bereits im vergangenen Jahr auf dem „Holy Ground“ in Wacken überzeugen. 2020 gehen die Crossover-Metaller aus einem Club in Los Angeles auf Sendung und bringen mit ihrer Mischung aus Metal und Hip-Hop Abwechslung und ordentlich Alarm zum WWW. Bereits seit 1990 stehen die beiden Gründer Ice-T (Gesang) und Ernie C (Gitarre) auf der Bühne. Sieben Studioalben hat die Gruppe bis jetzt veröffentlicht, darunter das Debüt „Body Count“ (1992) mit dem kontroversen Track „Cop Killer“.', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - # _match_id casts to string, but since "None" is not a valid video_id for magenta - # there is no risk for confusion - if video_id == "None": - webpage = self._download_webpage(url, video_id) - video_id = self._html_search_regex(r'data-asset-id="([^"]+)"', webpage, 'video_id') - json = self._download_json("https://wcps.t-online.de/cvss/magentamusic/vodplayer/v3/player/58935/%s/Main%%20Movie" % video_id, video_id) - xml_url = json['content']['feature']['representations'][0]['contentPackages'][0]['media']['href'] - metadata = json['content']['feature'].get('metadata') - title = None - description = None - duration = None - thumbnails = [] - if metadata: - title = metadata.get('title') - description = metadata.get('fullDescription') - duration = metadata.get('runtimeInSeconds') - for img_key in ('teaserImageWide', 'smallCoverImage'): - if img_key in metadata: - thumbnails.append({'url': metadata[img_key].get('href')}) - - xml = self._download_xml(xml_url, video_id) - final_url = xml[0][0][0].attrib['src'] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'url': final_url, - 'duration': duration, - 'thumbnails': thumbnails - } From aa5dcc4ee65916a36cbe1b1b5b29b9110c3163ed Mon Sep 17 00:00:00 2001 From: Giulio Muscarello Date: Fri, 19 Jan 2024 02:51:53 +0000 Subject: [PATCH 10/61] [ie/IlPost] Add extractor (#9001) Authored by: CapacitorSet --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/ilpost.py | 69 +++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 yt_dlp/extractor/ilpost.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b49e0366c0..5fc39d111b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -787,6 +787,7 @@ IHeartRadioIE, IHeartRadioPodcastIE, ) +from .ilpost import IlPostIE from .iltalehti import IltalehtiIE from .imdb import ( ImdbIE, diff --git a/yt_dlp/extractor/ilpost.py b/yt_dlp/extractor/ilpost.py new file mode 100644 index 0000000000..ae98399ee5 --- /dev/null +++ b/yt_dlp/extractor/ilpost.py @@ -0,0 +1,69 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class IlPostIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ilpost\.it/episodes/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.ilpost.it/episodes/1-avis-akvasas-ka/', + 'md5': '43649f002d85e1c2f319bb478d479c40', + 'info_dict': { + 'id': '2972047', + 'ext': 'mp3', + 'display_id': '1-avis-akvasas-ka', + 'title': '1. Avis akvasas ka', + 'url': 'https://www.ilpost.it/wp-content/uploads/2023/12/28/1703781217-l-invasione-pt1-v6.mp3', + 'timestamp': 1703835014, + 'upload_date': '20231229', + 'duration': 2495.0, + 'availability': 'public', + 'series_id': '235598', + 'description': '', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + endpoint_metadata = self._search_json( + r'var\s+ilpostpodcast\s*=', webpage, 'metadata', display_id) + episode_id = endpoint_metadata['post_id'] + podcast_id = endpoint_metadata['podcast_id'] + podcast_metadata = self._download_json( + endpoint_metadata['ajax_url'], display_id, data=urlencode_postdata({ + 'action': 'checkpodcast', + 'cookie': endpoint_metadata['cookie'], + 'post_id': episode_id, + 'podcast_id': podcast_id, + })) + + episode = traverse_obj(podcast_metadata, ( + 'data', 'postcastList', lambda _, v: str(v['id']) == episode_id, {dict}), get_all=False) + if not episode: + raise ExtractorError('Episode could not be extracted') + + return { + 'id': episode_id, + 'display_id': display_id, + 'series_id': podcast_id, + 'vcodec': 'none', + **traverse_obj(episode, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'url': ('podcast_raw_url', {url_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('timestamp', {int_or_none}), + 'duration': ('milliseconds', {functools.partial(float_or_none, scale=1000)}), + 'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}), + }), + } From 6171b050d70435008e64fa06aa6f19c4e5bec75f Mon Sep 17 00:00:00 2001 From: Karavellas <149634176+pompos02@users.noreply.github.com> Date: Fri, 19 Jan 2024 05:00:49 +0200 Subject: [PATCH 11/61] [ie/ElementorEmbed] Add extractor (#8948) Authored by: pompos02, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/elementorembed.py | 72 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 yt_dlp/extractor/elementorembed.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 5fc39d111b..7250ad5e07 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -540,6 +540,7 @@ from .eighttracks import EightTracksIE from .einthusan import EinthusanIE from .eitb import EitbIE +from .elementorembed import ElementorEmbedIE from .elonet import ElonetIE from .elpais import ElPaisIE from .eltrecetv import ElTreceTVIE diff --git a/yt_dlp/extractor/elementorembed.py b/yt_dlp/extractor/elementorembed.py new file mode 100644 index 0000000000..638893f6f6 --- /dev/null +++ b/yt_dlp/extractor/elementorembed.py @@ -0,0 +1,72 @@ +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from .youtube import YoutubeIE +from ..utils import unescapeHTML, url_or_none +from ..utils.traversal import traverse_obj + + +class ElementorEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + 'url': 'https://capitaltv.cy/2023/12/14/υγεια-και-ζωη-14-12-2023-δρ-ξενια-κωσταντινιδο/', + 'info_dict': { + 'id': 'KgzuxwuQwM4', + 'ext': 'mp4', + 'title': 'ΥΓΕΙΑ ΚΑΙ ΖΩΗ 14 12 2023 ΔΡ ΞΕΝΙΑ ΚΩΣΤΑΝΤΙΝΙΔΟΥ', + 'thumbnail': 'https://i.ytimg.com/vi/KgzuxwuQwM4/maxresdefault.jpg', + 'playable_in_embed': True, + 'tags': 'count:16', + 'like_count': int, + 'channel': 'Capital TV Cyprus', + 'channel_id': 'UCR8LwVKTLGEXt4ZAErpCMrg', + 'availability': 'public', + 'description': 'md5:7a3308a22881aea4612358c4ba121f77', + 'duration': 2891, + 'upload_date': '20231214', + 'uploader_id': '@capitaltvcyprus6389', + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCR8LwVKTLGEXt4ZAErpCMrg', + 'uploader_url': 'https://www.youtube.com/@capitaltvcyprus6389', + 'uploader': 'Capital TV Cyprus', + 'age_limit': 0, + 'categories': ['News & Politics'], + 'view_count': int, + 'channel_follower_count': int, + }, + }, { + 'url': 'https://elementor.com/academy/theme-builder-collection/?playlist=76011151&video=9e59909', + 'info_dict': { + 'id': '?playlist=76011151&video=9e59909', + 'title': 'Theme Builder Collection - Academy', + 'age_limit': 0, + 'timestamp': 1702196984.0, + 'upload_date': '20231210', + 'description': 'md5:7f52c52715ee9e54fd7f82210511673d', + 'thumbnail': 'https://elementor.com/academy/wp-content/uploads/2021/07/Theme-Builder-1.png', + }, + 'playlist_count': 11, + 'params': { + 'skip_download': True, + }, + }] + _WIDGET_REGEX = r']+class="[^"]*elementor-widget-video(?:-playlist)?[^"]*"[^>]*data-settings="([^"]*)"' + + def _extract_from_webpage(self, url, webpage): + for data_settings in re.findall(self._WIDGET_REGEX, webpage): + data = self._parse_json(data_settings, None, fatal=False, transform_source=unescapeHTML) + if youtube_url := traverse_obj(data, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + + for video in traverse_obj(data, ('tabs', lambda _, v: v['_id'], {dict})): + if youtube_url := traverse_obj(video, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + if vimeo_url := traverse_obj(video, ('vimeo_url', {url_or_none})): + yield self.url_result(vimeo_url, ie=VimeoIE) + for direct_url in traverse_obj(video, (('hosted_url', 'external_url'), 'url', {url_or_none})): + yield { + 'id': video['_id'], + 'url': direct_url, + 'title': video.get('title'), + } From ba6b0c8261e9f0a6373885736ff90a89dd1fb614 Mon Sep 17 00:00:00 2001 From: DmitryScaletta Date: Fri, 19 Jan 2024 06:16:21 +0300 Subject: [PATCH 12/61] [ie/chzzk] Add extractors (#8887) Closes #8804 Authored by: DmitryScaletta --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/chzzk.py | 139 ++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 yt_dlp/extractor/chzzk.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7250ad5e07..3d360a52f6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -345,6 +345,10 @@ ChingariIE, ChingariUserIE, ) +from .chzzk import ( + CHZZKLiveIE, + CHZZKVideoIE, +) from .cinemax import CinemaxIE from .cinetecamilano import CinetecaMilanoIE from .cineverse import ( diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py new file mode 100644 index 0000000000..6894baea5c --- /dev/null +++ b/yt_dlp/extractor/chzzk.py @@ -0,0 +1,139 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class CHZZKLiveIE(InfoExtractor): + IE_NAME = 'chzzk:live' + _VALID_URL = r'https?://chzzk\.naver\.com/live/(?P[\da-f]+)' + _TESTS = [{ + 'url': 'https://chzzk.naver.com/live/c68b8ef525fb3d2fa146344d84991753', + 'info_dict': { + 'id': 'c68b8ef525fb3d2fa146344d84991753', + 'ext': 'mp4', + 'title': str, + 'channel': '진짜도현', + 'channel_id': 'c68b8ef525fb3d2fa146344d84991753', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1705510344, + 'upload_date': '20240117', + 'live_status': 'is_live', + 'view_count': int, + 'concurrent_view_count': int, + }, + 'skip': 'The channel is not currently live', + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + live_detail = self._download_json( + f'https://api.chzzk.naver.com/service/v2/channels/{channel_id}/live-detail', channel_id, + note='Downloading channel info', errnote='Unable to download channel info')['content'] + + if live_detail.get('status') == 'CLOSE': + raise ExtractorError('The channel is not currently live', expected=True) + + live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id) + + thumbnails = [] + thumbnail_template = traverse_obj( + live_playback, ('thumbnail', 'snapshotThumbnailTemplate', {url_or_none})) + if thumbnail_template and '{type}' in thumbnail_template: + for width in traverse_obj(live_playback, ('thumbnail', 'types', ..., {str})): + thumbnails.append({ + 'id': width, + 'url': thumbnail_template.replace('{type}', width), + 'width': int_or_none(width), + }) + + formats, subtitles = [], {} + for media in traverse_obj(live_playback, ('media', lambda _, v: url_or_none(v['path']))): + is_low_latency = media.get('mediaId') == 'LLHLS' + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media['path'], channel_id, 'mp4', fatal=False, live=True, + m3u8_id='hls-ll' if is_low_latency else 'hls') + for f in fmts: + if is_low_latency: + f['source_preference'] = -2 + if '-afragalow.stream-audio.stream' in f['format_id']: + f['quality'] = -2 + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': channel_id, + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + **traverse_obj(live_detail, { + 'title': ('liveTitle', {str}), + 'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}), + 'concurrent_view_count': ('concurrentUserCount', {int_or_none}), + 'view_count': ('accumulateCount', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } + + +class CHZZKVideoIE(InfoExtractor): + IE_NAME = 'chzzk:video' + _VALID_URL = r'https?://chzzk\.naver\.com/video/(?P\d+)' + _TESTS = [{ + 'url': 'https://chzzk.naver.com/video/1754', + 'md5': 'b0c0c1bb888d913b93d702b1512c7f06', + 'info_dict': { + 'id': '1754', + 'ext': 'mp4', + 'title': '치지직 테스트 방송', + 'channel': '침착맨', + 'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 15577, + 'timestamp': 1702970505.417, + 'upload_date': '20231219', + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_meta = self._download_json( + f'https://api.chzzk.naver.com/service/v2/videos/{video_id}', video_id, + note='Downloading video info', errnote='Unable to download video info')['content'] + formats, subtitles = self._extract_mpd_formats_and_subtitles( + f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id, + query={ + 'key': video_meta['inKey'], + 'env': 'real', + 'lc': 'en_US', + 'cpl': 'en_US', + }, note='Downloading video playback', errnote='Unable to download video playback') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(video_meta, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}), + 'view_count': ('readCount', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } From a281beba8d8f007cf220f96dd1d9412bb070c7d8 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Fri, 19 Jan 2024 05:41:10 +0100 Subject: [PATCH 13/61] [ie/naver] Fix extractors (#8883) Closes #8850, Closes #8692 Authored by: seproDev --- yt_dlp/extractor/naver.py | 173 ++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 83 deletions(-) diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 2d8459b02b..806b79082c 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -1,20 +1,25 @@ +import base64 +import hashlib +import hmac import itertools +import json import re -from urllib.parse import urlparse, parse_qs +import time +from urllib.parse import parse_qs, urlparse from .common import InfoExtractor from ..utils import ( ExtractorError, - clean_html, dict_get, int_or_none, join_nonempty, merge_dicts, - parse_duration, + parse_iso8601, traverse_obj, try_get, unified_timestamp, update_url_query, + url_or_none, ) @@ -110,6 +115,18 @@ def get_subs(caption_url): **self.process_subtitles(video_data, get_subs), } + def _call_api(self, path, video_id): + api_endpoint = f'https://apis.naver.com/now_web2/now_web_api/v1{path}' + key = b'nbxvs5nwNG9QKEWK0ADjYA4JZoujF4gHcIwvoCxFTPAeamq5eemvt5IWAYXxrbYM' + msgpad = int(time.time() * 1000) + md = base64.b64encode(hmac.HMAC( + key, f'{api_endpoint[:255]}{msgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + + return self._download_json(api_endpoint, video_id=video_id, headers=self.geo_verification_headers(), query={ + 'msgpad': msgpad, + 'md': md, + })['result'] + class NaverIE(NaverBaseIE): _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P\d+)' @@ -125,21 +142,32 @@ class NaverIE(NaverBaseIE): 'upload_date': '20130903', 'uploader': '메가스터디, 합격불변의 법칙', 'uploader_id': 'megastudy', + 'uploader_url': 'https://tv.naver.com/megastudy', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 2118, + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://tv.naver.com/v/395837', - 'md5': '8a38e35354d26a17f73f4e90094febd3', + 'md5': '7791205fa89dbed2f5e3eb16d287ff05', 'info_dict': { 'id': '395837', 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'description': 'md5:c76be23e21403a6473d8119678cdb5cb', 'timestamp': 1432030253, 'upload_date': '20150519', - 'uploader': '4가지쇼 시즌2', - 'uploader_id': 'wrappinguser29', + 'uploader': '4가지쇼', + 'uploader_id': '4show', + 'uploader_url': 'https://tv.naver.com/4show', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 277, + 'thumbnail': r're:^https?://.*\.jpg', }, - 'skip': 'Georestricted', }, { 'url': 'http://tvcast.naver.com/v/81652', 'only_matching': True, @@ -147,56 +175,63 @@ class NaverIE(NaverBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - 'https://tv.naver.com/api/json/v/' + video_id, - video_id, headers=self.geo_verification_headers()) - player_info_json = content.get('playerInfoJson') or {} - current_clip = player_info_json.get('currentClip') or {} + data = self._call_api(f'/clips/{video_id}/play-info', video_id) - vid = current_clip.get('videoId') - in_key = current_clip.get('inKey') + vid = traverse_obj(data, ('clip', 'videoId', {str})) + in_key = traverse_obj(data, ('play', 'inKey', {str})) if not vid or not in_key: - player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) - if player_auth == 'notCountry': - self.raise_geo_restricted(countries=['KR']) - elif player_auth == 'notLogin': - self.raise_login_required() - raise ExtractorError('couldn\'t extract vid and key') + raise ExtractorError('Unable to extract video info') + info = self._extract_video_info(video_id, vid, in_key) - info.update({ - 'description': clean_html(current_clip.get('description')), - 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), - 'duration': parse_duration(current_clip.get('displayPlayTime')), - 'like_count': int_or_none(current_clip.get('recommendPoint')), - 'age_limit': 19 if current_clip.get('adult') else None, - }) + info.update(traverse_obj(data, ('clip', { + 'title': 'title', + 'description': 'description', + 'timestamp': ('firstExposureDatetime', {parse_iso8601}), + 'duration': ('playTime', {int_or_none}), + 'like_count': ('likeItCount', {int_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'uploader': 'channelName', + 'uploader_id': 'channelId', + 'uploader_url': ('channelUrl', {url_or_none}), + 'age_limit': ('adultVideo', {lambda x: 19 if x else None}), + }))) return info -class NaverLiveIE(InfoExtractor): +class NaverLiveIE(NaverBaseIE): IE_NAME = 'Naver:live' _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P\d+)' _GEO_BYPASS = False _TESTS = [{ - 'url': 'https://tv.naver.com/l/52010', + 'url': 'https://tv.naver.com/l/127062', 'info_dict': { - 'id': '52010', + 'id': '127062', 'ext': 'mp4', - 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', - 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', - 'channel_id': 'NTV-ytnnews24-0', - 'start_time': 1597026780000, + 'live_status': 'is_live', + 'channel': '뉴스는 YTN', + 'channel_id': 'ytnnews24', + 'title': 're:^대한민국 24시간 뉴스 채널 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f938b5956711beab6f882314ffadf4d5', + 'start_time': 1677752280, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { - 'url': 'https://tv.naver.com/l/51549', + 'url': 'https://tv.naver.com/l/140535', 'info_dict': { - 'id': '51549', + 'id': '140535', 'ext': 'mp4', - 'title': '연합뉴스TV - 코로나19 뉴스특보', - 'description': 'md5:c655e82091bc21e413f549c0eaccc481', - 'channel_id': 'NTV-yonhapnewstv-0', - 'start_time': 1596406380000, + 'live_status': 'is_live', + 'channel': 'KBS뉴스', + 'channel_id': 'kbsnews', + 'start_time': 1696867320, + 'title': 're:^언제 어디서나! KBS 뉴스 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6ad419c0bf2f332829bda3f79c295284', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { 'url': 'https://tv.naver.com/l/54887', @@ -205,55 +240,27 @@ class NaverLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading Page', 'Unable to download Page') - secure_url = self._search_regex(r'sApiF:\s+(?:"|\')([^"\']+)', page, 'secureurl') - - info = self._extract_video_info(video_id, secure_url) - info.update({ - 'description': self._og_search_description(page) - }) - - return info - - def _extract_video_info(self, video_id, url): - video_data = self._download_json(url, video_id, headers=self.geo_verification_headers()) - meta = video_data.get('meta') - status = meta.get('status') + data = self._call_api(f'/live-end/normal/{video_id}/play-info?renewLastPlayDate=true', video_id) + status = traverse_obj(data, ('live', 'liveStatus')) if status == 'CLOSED': raise ExtractorError('Stream is offline.', expected=True) elif status != 'OPENED': - raise ExtractorError('Unknown status %s' % status) - - title = meta.get('title') - stream_list = video_data.get('streams') - - if stream_list is None: - raise ExtractorError('Could not get stream data.', expected=True) - - formats = [] - for quality in stream_list: - if not quality.get('url'): - continue - - prop = quality.get('property') - if prop.get('abr'): # This abr doesn't mean Average audio bitrate. - continue - - formats.extend(self._extract_m3u8_formats( - quality.get('url'), video_id, 'mp4', - m3u8_id=quality.get('qualityId'), live=True - )) + raise ExtractorError(f'Unknown status {status!r}') return { 'id': video_id, - 'title': title, - 'formats': formats, - 'channel_id': meta.get('channelId'), - 'channel_url': meta.get('channelUrl'), - 'thumbnail': meta.get('imgUrl'), - 'start_time': meta.get('startTime'), - 'categories': [meta.get('categoryId')], + 'formats': self._extract_m3u8_formats( + traverse_obj(data, ('playbackBody', {json.loads}, 'media', 0, 'path')), video_id, live=True), + **traverse_obj(data, ('live', { + 'title': 'title', + 'channel': 'channelName', + 'channel_id': 'channelId', + 'description': 'description', + 'like_count': (('likeCount', 'likeItCount'), {int_or_none}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'start_time': (('startTime', 'startDateTime', 'startYmdt'), {parse_iso8601}), + }), get_all=False), 'is_live': True } From c51316f8a69fbd0080f2720777d42ab438e254a3 Mon Sep 17 00:00:00 2001 From: sefidel Date: Fri, 19 Jan 2024 18:43:13 +0900 Subject: [PATCH 14/61] [ie/abematv] Fix extraction with cache (#8895) Closes #6532 Authored by: sefidel --- yt_dlp/extractor/abematv.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 57ccb928be..0a610e3151 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -136,11 +136,15 @@ def _get_device_token(self): if self._USERTOKEN: return self._USERTOKEN + add_opener(self._downloader, AbemaLicenseHandler(self)) + username, _ = self._get_login_info() - AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username) + auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') + AbemaTVBaseIE._USERTOKEN = auth_cache and auth_cache.get('usertoken') if AbemaTVBaseIE._USERTOKEN: # try authentication with locally stored token try: + AbemaTVBaseIE._DEVICE_ID = auth_cache.get('device_id') self._get_media_token(True) return except ExtractorError as e: @@ -159,7 +163,6 @@ def _get_device_token(self): }) AbemaTVBaseIE._USERTOKEN = user_data['token'] - add_opener(self._downloader, AbemaLicenseHandler(self)) return self._USERTOKEN def _get_media_token(self, invalidate=False, to_show=True): @@ -255,7 +258,7 @@ class AbemaTVIE(AbemaTVBaseIE): def _perform_login(self, username, password): self._get_device_token() - if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token(): + if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token(): self.write_debug('Skipping logging in') return @@ -278,7 +281,11 @@ def _perform_login(self, username, password): AbemaTVBaseIE._USERTOKEN = login_response['token'] self._get_media_token(True) - self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN) + auth_cache = { + 'device_id': AbemaTVBaseIE._DEVICE_ID, + 'usertoken': AbemaTVBaseIE._USERTOKEN, + } + self.cache.store(self._NETRC_MACHINE, username, auth_cache) def _real_extract(self, url): # starting download using infojson from this extractor is undefined behavior, From 8226a3818f804478c756cf460baa9bf3a3b062a5 Mon Sep 17 00:00:00 2001 From: sefidel Date: Fri, 19 Jan 2024 18:50:16 +0900 Subject: [PATCH 15/61] [ie/abematv] Support login for playlists (#8901) Authored by: sefidel --- yt_dlp/extractor/abematv.py | 65 +++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 0a610e3151..6453dde973 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -92,6 +92,8 @@ def abematv_license_open(self, url): class AbemaTVBaseIE(InfoExtractor): + _NETRC_MACHINE = 'abematv' + _USERTOKEN = None _DEVICE_ID = None _MEDIATOKEN = None @@ -184,6 +186,37 @@ def _get_media_token(self, invalidate=False, to_show=True): return self._MEDIATOKEN + def _perform_login(self, username, password): + self._get_device_token() + if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token(): + self.write_debug('Skipping logging in') + return + + if '@' in username: # don't strictly check if it's email address or not + ep, method = 'user/email', 'email' + else: + ep, method = 'oneTimePassword', 'userId' + + login_response = self._download_json( + f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in', + data=json.dumps({ + method: username, + 'password': password + }).encode('utf-8'), headers={ + 'Authorization': f'bearer {self._get_device_token()}', + 'Origin': 'https://abema.tv', + 'Referer': 'https://abema.tv/', + 'Content-Type': 'application/json', + }) + + AbemaTVBaseIE._USERTOKEN = login_response['token'] + self._get_media_token(True) + auth_cache = { + 'device_id': AbemaTVBaseIE._DEVICE_ID, + 'usertoken': AbemaTVBaseIE._USERTOKEN, + } + self.cache.store(self._NETRC_MACHINE, username, auth_cache) + def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'): return self._download_json( f'https://api.abema.io/{endpoint}', video_id, query=query or {}, @@ -207,7 +240,6 @@ def _extract_breadcrumb_list(self, webpage, video_id): class AbemaTVIE(AbemaTVBaseIE): _VALID_URL = r'https?://abema\.tv/(?Pnow-on-air|video/episode|channels/.+?/slots)/(?P[^?/]+)' - _NETRC_MACHINE = 'abematv' _TESTS = [{ 'url': 'https://abema.tv/video/episode/194-25_s2_p1', 'info_dict': { @@ -256,37 +288,6 @@ class AbemaTVIE(AbemaTVBaseIE): }] _TIMETABLE = None - def _perform_login(self, username, password): - self._get_device_token() - if self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19') and self._get_media_token(): - self.write_debug('Skipping logging in') - return - - if '@' in username: # don't strictly check if it's email address or not - ep, method = 'user/email', 'email' - else: - ep, method = 'oneTimePassword', 'userId' - - login_response = self._download_json( - f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in', - data=json.dumps({ - method: username, - 'password': password - }).encode('utf-8'), headers={ - 'Authorization': f'bearer {self._get_device_token()}', - 'Origin': 'https://abema.tv', - 'Referer': 'https://abema.tv/', - 'Content-Type': 'application/json', - }) - - AbemaTVBaseIE._USERTOKEN = login_response['token'] - self._get_media_token(True) - auth_cache = { - 'device_id': AbemaTVBaseIE._DEVICE_ID, - 'usertoken': AbemaTVBaseIE._USERTOKEN, - } - self.cache.store(self._NETRC_MACHINE, username, auth_cache) - def _real_extract(self, url): # starting download using infojson from this extractor is undefined behavior, # and never be fixed in the future; you must trigger downloads by directly specifying URL. From 43694ce13c5a9f1afca8b02b8b2b9b1576d6503d Mon Sep 17 00:00:00 2001 From: SirElderling <148036781+SirElderling@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:19:09 +0000 Subject: [PATCH 16/61] [ie/NineNews] Add extractor (#8840) Closes #8831 Authored by: SirElderling --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/ninenews.py | 72 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 yt_dlp/extractor/ninenews.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3d360a52f6..abba5bfa2f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1269,6 +1269,7 @@ NiconicoChannelPlusChannelLivesIE, ) from .ninegag import NineGagIE +from .ninenews import NineNewsIE from .ninenow import NineNowIE from .nintendo import NintendoIE from .nitter import NitterIE diff --git a/yt_dlp/extractor/ninenews.py b/yt_dlp/extractor/ninenews.py new file mode 100644 index 0000000000..900d9ba60f --- /dev/null +++ b/yt_dlp/extractor/ninenews.py @@ -0,0 +1,72 @@ +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ExtractorError +from ..utils.traversal import traverse_obj + + +class NineNewsIE(InfoExtractor): + IE_NAME = '9News' + _VALID_URL = r'https?://(?:www\.)?9news\.com\.au/(?:[\w-]+/){2,3}(?P[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.9news.com.au/videos/national/fair-trading-pulls-dozens-of-toys-from-shelves/clqgc7dvj000y0jnvfism0w5m', + 'md5': 'd1a65b2e9d126e5feb9bc5cb96e62c80', + 'info_dict': { + 'id': '6343717246112', + 'ext': 'mp4', + 'title': 'Fair Trading pulls dozens of toys from shelves', + 'description': 'Fair Trading Australia have been forced to pull dozens of toys from shelves over hazard fears.', + 'thumbnail': 'md5:bdbe44294e2323b762d97acf8843f66c', + 'duration': 93.44, + 'timestamp': 1703231748, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'christmas presents', 'toys', 'fair trading', 'au_news'], + } + }, { + 'url': 'https://www.9news.com.au/world/tape-reveals-donald-trump-pressured-michigan-officials-not-to-certify-2020-vote-a-new-report-says/0b8b880e-7d3c-41b9-b2bd-55bc7e492259', + 'md5': 'a885c44d20898c3e70e9a53e8188cea1', + 'info_dict': { + 'id': '6343587450112', + 'ext': 'mp4', + 'title': 'Trump found ineligible to run for president by state court', + 'description': 'md5:40e6e7db7a4ac6be0e960569a5af6066', + 'thumbnail': 'md5:3e132c48c186039fd06c10787de9bff2', + 'duration': 104.64, + 'timestamp': 1703058034, + 'upload_date': '20231220', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'ineligible', 'presidential candidate', 'donald trump', 'au_news'], + } + }, { + 'url': 'https://www.9news.com.au/national/outrage-as-parents-banned-from-giving-gifts-to-kindergarten-teachers/e19b49d4-a1a4-4533-9089-6e10e2d9386a', + 'info_dict': { + 'id': '6343716797112', + 'ext': 'mp4', + 'title': 'Outrage as parents banned from giving gifts to kindergarten teachers', + 'description': 'md5:7a8b0ed2f9e08875fd9a3e86e462bc46', + 'thumbnail': 'md5:5ee4d66717bdd0dee9fc9a705ef041b8', + 'duration': 91.307, + 'timestamp': 1703229584, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'presents', 'teachers', 'kindergarten', 'au_news'], + }, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + initial_state = self._search_json( + r'var\s+__INITIAL_STATE__\s*=', webpage, 'initial state', article_id) + video_id = traverse_obj( + initial_state, ('videoIndex', 'currentVideo', 'brightcoveId', {str}), + ('article', ..., 'media', lambda _, v: v['type'] == 'video', 'urn', {str}), get_all=False) + account = traverse_obj(initial_state, ( + 'videoIndex', 'config', (None, 'video'), 'account', {str}), get_all=False) + + if not video_id or not account: + raise ExtractorError('Unable to get the required video data') + + return self.url_result( + f'https://players.brightcove.net/{account}/default_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) From 20cdad5a2c0499d5a6746f5466a2ab0c97b75884 Mon Sep 17 00:00:00 2001 From: DmitryScaletta Date: Fri, 19 Jan 2024 18:21:25 +0300 Subject: [PATCH 17/61] [ie/KukuluLive] Add extractor (#8877) Closes #8865 Authored by: DmitryScaletta --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kukululive.py | 140 ++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 yt_dlp/extractor/kukululive.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index abba5bfa2f..aacb08fb67 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -905,6 +905,7 @@ from .kth import KTHIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kukululive import KukuluLiveIE from .kusi import KUSIIE from .kuwo import ( KuwoIE, diff --git a/yt_dlp/extractor/kukululive.py b/yt_dlp/extractor/kukululive.py new file mode 100644 index 0000000000..86ab5d40ec --- /dev/null +++ b/yt_dlp/extractor/kukululive.py @@ -0,0 +1,140 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + filter_dict, + get_element_by_id, + int_or_none, + join_nonempty, + js_to_json, + qualities, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class KukuluLiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.erinn\.biz/live\.php\?h(?P\d+)' + _TESTS = [{ + 'url': 'https://live.erinn.biz/live.php?h675134569', + 'md5': 'e380fa6a47fc703d91cea913ab44ec2e', + 'info_dict': { + 'id': '675134569', + 'ext': 'mp4', + 'title': 'プロセカ', + 'description': 'テストも兼ねたプロセカ配信。', + 'timestamp': 1702689148, + 'upload_date': '20231216', + 'thumbnail': r're:^https?://.*', + }, + }, { + 'url': 'https://live.erinn.biz/live.php?h102338092', + 'md5': 'dcf5167a934b1c60333461e13a81a6e2', + 'info_dict': { + 'id': '102338092', + 'ext': 'mp4', + 'title': 'Among Usで遊びます!!', + 'description': 'VTuberになりましたねんねこ㌨ですよろしくお願いします', + 'timestamp': 1704603118, + 'upload_date': '20240107', + 'thumbnail': r're:^https?://.*', + }, + }, { + 'url': 'https://live.erinn.biz/live.php?h878049531', + 'only_matching': True, + }] + + def _get_quality_meta(self, video_id, desc, code, force_h264=None): + desc += ' (force_h264)' if force_h264 else '' + qs = self._download_webpage( + 'https://live.erinn.biz/live.player.fplayer.php', video_id, + f'Downloading {desc} quality metadata', f'Unable to download {desc} quality metadata', + query=filter_dict({ + 'hash': video_id, + 'action': f'get{code}liveByAjax', + 'force_h264': force_h264, + })) + return urllib.parse.parse_qs(qs) + + def _add_quality_formats(self, formats, quality_meta): + vcodec = traverse_obj(quality_meta, ('vcodec', 0, {str})) + quality = traverse_obj(quality_meta, ('now_quality', 0, {str})) + quality_priority = qualities(('low', 'h264', 'high'))(quality) + if traverse_obj(quality_meta, ('hlsaddr', 0, {url_or_none})): + formats.append({ + 'format_id': quality, + 'url': quality_meta['hlsaddr'][0], + 'ext': 'mp4', + 'vcodec': vcodec, + 'quality': quality_priority, + }) + if traverse_obj(quality_meta, ('hlsaddr_audioonly', 0, {url_or_none})): + formats.append({ + 'format_id': join_nonempty(quality, 'audioonly'), + 'url': quality_meta['hlsaddr_audioonly'][0], + 'ext': 'm4a', + 'vcodec': 'none', + 'quality': quality_priority, + }) + + def _real_extract(self, url): + video_id = self._match_id(url) + html = self._download_webpage(url, video_id) + + if '>タイムシフトが見つかりませんでした。<' in html: + raise ExtractorError('This stream has expired', expected=True) + + title = clean_html( + get_element_by_id('livetitle', html.replace('', 'span>'))) + description = self._html_search_meta('Description', html) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], html) + + if self._search_regex(r'(var\s+timeshift\s*=\s*false)', html, 'is livestream', default=False): + formats = [] + for (desc, code) in [('high', 'Z'), ('low', 'ForceLow')]: + quality_meta = self._get_quality_meta(video_id, desc, code) + self._add_quality_formats(formats, quality_meta) + if desc == 'high' and traverse_obj(quality_meta, ('vcodec', 0)) == 'HEVC': + self._add_quality_formats( + formats, self._get_quality_meta(video_id, desc, code, force_h264='1')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } + + # VOD extraction + player_html = self._download_webpage( + 'https://live.erinn.biz/live.timeshift.fplayer.php', video_id, + 'Downloading player html', 'Unable to download player html', query={'hash': video_id}) + + sources = traverse_obj(self._search_json( + r'var\s+fplayer_source\s*=', player_html, 'stream data', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json), lambda _, v: v['file']) + + def entries(segments, playlist=True): + for i, segment in enumerate(segments, 1): + yield { + 'id': f'{video_id}_{i}' if playlist else video_id, + 'title': f'{title} (Part {i})' if playlist else title, + 'description': description, + 'timestamp': traverse_obj(segment, ('time_start', {int_or_none})), + 'thumbnail': thumbnail, + 'formats': [{ + 'url': urljoin('https://live.erinn.biz', segment['file']), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + if len(sources) == 1: + return next(entries(sources, playlist=False)) + + return self.playlist_result(entries(sources), video_id, title, description, multi_video=True) From e641aab7a61df7406df60ebfe0c77bd5186b2b41 Mon Sep 17 00:00:00 2001 From: ArnauvGilotra Date: Fri, 19 Jan 2024 20:57:34 +0530 Subject: [PATCH 18/61] [ie/AmadeusTV] Add extractor (#8744) Closes #8155 Authored by: ArnauvGilotra --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/amadeustv.py | 77 +++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 yt_dlp/extractor/amadeustv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index aacb08fb67..8a7f62ccd8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -93,6 +93,7 @@ AluraIE, AluraCourseIE ) +from .amadeustv import AmadeusTVIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE from .amazon import ( diff --git a/yt_dlp/extractor/amadeustv.py b/yt_dlp/extractor/amadeustv.py new file mode 100644 index 0000000000..2f5ca9137a --- /dev/null +++ b/yt_dlp/extractor/amadeustv.py @@ -0,0 +1,77 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class AmadeusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amadeus\.tv/library/(?P[\da-f]+)' + _TESTS = [{ + 'url': 'http://www.amadeus.tv/library/65091a87ff85af59d9fc54c3', + 'info_dict': { + 'id': '5576678021301411311', + 'ext': 'mp4', + 'title': 'Jieon Park - 第五届珠海莫扎特国际青少年音乐周小提琴C组第三轮', + 'thumbnail': 'http://1253584441.vod2.myqcloud.com/a0046a27vodtransbj1253584441/7db4af535576678021301411311/coverBySnapshot_10_0.jpg', + 'duration': 1264.8, + 'upload_date': '20230918', + 'timestamp': 1695034800, + 'display_id': '65091a87ff85af59d9fc54c3', + 'view_count': int, + 'description': 'md5:a0357b9c215489e2067cbae0b777bb95', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nuxt_data = self._search_nuxt_data(webpage, display_id, traverse=('fetch', '0')) + video_id = traverse_obj(nuxt_data, ('item', 'video', {str})) + + if not video_id: + raise ExtractorError('Unable to extract actual video ID') + + video_data = self._download_json( + f'http://playvideo.qcloud.com/getplayinfo/v2/1253584441/{video_id}', + video_id, headers={'Referer': 'http://www.amadeus.tv/'}) + + formats = [] + for video in traverse_obj(video_data, ('videoInfo', ('sourceVideo', ('transcodeList', ...)), {dict})): + if not url_or_none(video.get('url')): + continue + formats.append({ + **traverse_obj(video, { + 'url': 'url', + 'format_id': ('definition', {lambda x: f'http-{x or "0"}'}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': (('totalSize', 'size'), {int_or_none}), + 'vcodec': ('videoStreamList', 0, 'codec'), + 'acodec': ('audioStreamList', 0, 'codec'), + 'fps': ('videoStreamList', 0, 'fps', {float_or_none}), + }, get_all=False), + 'http_headers': {'Referer': 'http://www.amadeus.tv/'}, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoInfo', 'basicInfo', 'name', {str}), + 'thumbnail': ('coverInfo', 'coverUrl', {url_or_none}), + 'duration': ('videoInfo', 'sourceVideo', ('floatDuration', 'duration'), {float_or_none}), + }, get_all=False), + **traverse_obj(nuxt_data, ('item', { + 'title': (('title', 'title_en', 'title_cn'), {str}), + 'description': (('description', 'description_en', 'description_cn'), {str}), + 'timestamp': ('date', {parse_iso8601}), + 'view_count': ('view', {int_or_none}), + }), get_all=False), + } From 8ab84650837e58046430c9f4b615c56a8886e071 Mon Sep 17 00:00:00 2001 From: ufukk <5383665+ufukk@users.noreply.github.com> Date: Fri, 19 Jan 2024 18:38:39 +0300 Subject: [PATCH 19/61] [ie/TrtWorld] Add extractor (#8701) Closes #8455 Authored by: ufukk --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/trtworld.py | 101 ++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 yt_dlp/extractor/trtworld.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8a7f62ccd8..489f638f4b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2011,6 +2011,7 @@ TrovoChannelClipIE, ) from .trtcocuk import TrtCocukVideoIE +from .trtworld import TrtWorldIE from .trueid import TrueIDIE from .trunews import TruNewsIE from .truth import TruthIE diff --git a/yt_dlp/extractor/trtworld.py b/yt_dlp/extractor/trtworld.py new file mode 100644 index 0000000000..dbb72a4fe7 --- /dev/null +++ b/yt_dlp/extractor/trtworld.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, determine_ext, parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class TrtWorldIE(InfoExtractor): + _VALID_URL = r'https?://www\.trtworld\.com/video/[\w-]+/[\w-]+-(?P\d+)' + + _TESTS = [{ + 'url': 'https://www.trtworld.com/video/news/turkiye-switches-to-sustainable-tourism-16067690', + 'info_dict': { + 'id': '16067690', + 'ext': 'mp4', + 'title': 'Türkiye switches to sustainable tourism', + 'release_timestamp': 1701529569, + 'release_date': '20231202', + 'thumbnail': 'https://cdn-i.pr.trt.com.tr/trtworld/17647563_0-0-1920-1080.jpeg', + 'description': 'md5:0a975c04257fb529c8f99c7b76a2cf12', + } + }, { + 'url': 'https://www.trtworld.com/video/one-offs/frames-from-anatolia-recreating-a-james-bond-scene-in-istanbuls-grand-bazaar-14541780', + 'info_dict': { + 'id': '14541780', + 'ext': 'mp4', + 'title': 'Frames From Anatolia: Recreating a ‘James Bond’ Scene in Istanbul’s Grand Bazaar', + 'release_timestamp': 1692440844, + 'release_date': '20230819', + 'thumbnail': 'https://cdn-i.pr.trt.com.tr/trtworld/16939810_0-0-1920-1080.jpeg', + 'description': 'md5:4050e21570cc3c40b6c9badae800a94f', + } + }, { + 'url': 'https://www.trtworld.com/video/the-newsmakers/can-sudan-find-peace-amidst-failed-transition-to-democracy-12904760', + 'info_dict': { + 'id': '12904760', + 'ext': 'mp4', + 'title': 'Can Sudan find peace amidst failed transition to democracy?', + 'release_timestamp': 1681972747, + 'release_date': '20230420', + 'thumbnail': 'http://cdni0.trtworld.com/w768/q70/154214_NMYOUTUBETEMPLATE1_1681833018736.jpg' + } + }, { + 'url': 'https://www.trtworld.com/video/africa-matters/locals-learning-to-cope-with-rising-tides-of-kenyas-great-lakes-16059545', + 'info_dict': { + 'id': 'zEns2dWl00w', + 'ext': 'mp4', + 'title': "Locals learning to cope with rising tides of Kenya's Great Lakes", + 'thumbnail': 'https://i.ytimg.com/vi/zEns2dWl00w/maxresdefault.jpg', + 'description': 'md5:3ad9d7c5234d752a4ead4340c79c6b8d', + 'channel_id': 'UC7fWeaHhqgM4Ry-RMpM2YYw', + 'channel_url': 'https://www.youtube.com/channel/UC7fWeaHhqgM4Ry-RMpM2YYw', + 'duration': 210, + 'view_count': int, + 'age_limit': 0, + 'webpage_url': 'https://www.youtube.com/watch?v=zEns2dWl00w', + 'categories': ['News & Politics'], + 'channel': 'TRT World', + 'channel_follower_count': int, + 'channel_is_verified': True, + 'uploader': 'TRT World', + 'uploader_id': '@trtworld', + 'uploader_url': 'https://www.youtube.com/@trtworld', + 'upload_date': '20231202', + 'availability': 'public', + 'comment_count': int, + 'playable_in_embed': True, + 'tags': [], + 'live_status': 'not_live', + 'like_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nuxtjs_data = self._search_nuxt_data(webpage, display_id)['videoData']['content']['platforms'] + formats = [] + for media_url in traverse_obj(nuxtjs_data, ( + ('website', 'ott'), 'metadata', ('hls_url', 'url'), {url_or_none})): + # NB: Website sometimes serves mp4 files under `hls_url` key + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(media_url, display_id, fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': media_url, + }) + if not formats: + if youtube_id := traverse_obj(nuxtjs_data, ('youtube', 'metadata', 'youtubeId')): + return self.url_result(youtube_id, 'Youtube') + raise ExtractorError('No video found', expected=True) + + return { + 'id': display_id, + 'formats': formats, + **traverse_obj(nuxtjs_data, (('website', 'ott'), { + 'title': ('fields', 'title', 'text', {str}), + 'description': ('fields', 'description', 'text', {str}), + 'thumbnail': ('fields', 'thumbnail', 'url', {url_or_none}), + 'release_timestamp': ('published', 'date', {parse_iso8601}), + }), get_all=False), + } From 5154dc0a687528f995cde22b5ff63f82c740e98a Mon Sep 17 00:00:00 2001 From: alien-developers <154035958+alien-developers@users.noreply.github.com> Date: Fri, 19 Jan 2024 21:18:45 +0530 Subject: [PATCH 20/61] [ie/JioSaavnSong] Support more bitrates (#8834) Authored by: alien-developers, bashonly Co-authored-by: bashonly --- README.md | 3 +++ yt_dlp/extractor/jiosaavn.py | 50 +++++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 16947ce30b..b6a79667c3 100644 --- a/README.md +++ b/README.md @@ -1888,6 +1888,9 @@ #### nhkradirulive (NHK らじる★らじる LIVE) #### nflplusreplay * `type`: Type(s) of game replays to extract. Valid types are: `full_game`, `full_game_spanish`, `condensed_game` and `all_22`. You can use `all` to extract all available replay types, which is the default +#### jiosaavn +* `bitrate`: Audio bitrates to request. One or more of `16`, `32`, `64`, `128`, `320`. Default is `128,320` + **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 552b73f717..a592098359 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from ..utils import ( + int_or_none, js_to_json, url_or_none, urlencode_postdata, @@ -20,39 +21,64 @@ class JioSaavnSongIE(JioSaavnBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', - 'md5': '7b1f70de088ede3a152ea34aece4df42', + 'md5': '3b84396d15ed9e083c3106f1fa589c04', 'info_dict': { 'id': 'OQsEfQFVUXk', - 'ext': 'mp3', + 'ext': 'mp4', 'title': 'Leja Re', 'album': 'Leja Re', 'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'duration': 205, + 'view_count': int, + 'release_year': 2018, }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', 'only_matching': True, }] + _VALID_BITRATES = ('16', '32', '64', '128', '320') + def _real_extract(self, url): audio_id = self._match_id(url) + extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') + if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]: + raise ValueError( + f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. ' + + f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}') + song_data = self._extract_initial_data(url, audio_id)['song']['song'] - media_data = self._download_json( - 'https://www.jiosaavn.com/api.php', audio_id, data=urlencode_postdata({ - '__call': 'song.generateAuthToken', - '_format': 'json', - 'bitrate': '128', - 'url': song_data['encrypted_media_url'], - })) + formats = [] + for bitrate in extract_bitrates: + media_data = self._download_json( + 'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}', + fatal=False, data=urlencode_postdata({ + '__call': 'song.generateAuthToken', + '_format': 'json', + 'bitrate': bitrate, + 'url': song_data['encrypted_media_url'], + })) + if not media_data.get('auth_url'): + self.report_warning(f'Unable to extract format info for {bitrate}') + continue + formats.append({ + 'url': media_data['auth_url'], + 'ext': media_data.get('type'), + 'format_id': bitrate, + 'abr': int(bitrate), + 'vcodec': 'none', + }) return { 'id': audio_id, - 'url': media_data['auth_url'], - 'ext': media_data.get('type'), - 'vcodec': 'none', + 'formats': formats, **traverse_obj(song_data, { 'title': ('title', 'text'), 'album': ('album', 'text'), 'thumbnail': ('image', 0, {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'release_year': ('year', {int_or_none}), }), } From 12f042740550c06552819374e2251deb7a519bab Mon Sep 17 00:00:00 2001 From: Snack Date: Sat, 20 Jan 2024 01:16:07 +0900 Subject: [PATCH 21/61] [ie/asobichannel] Add extractors (#8700) Authored by: Snack-X --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/asobichannel.py | 168 +++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 yt_dlp/extractor/asobichannel.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 489f638f4b..eca45019e6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -145,6 +145,7 @@ ArteTVCategoryIE, ) from .arnes import ArnesIE +from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE from .atresplayer import AtresPlayerIE from .atscaleconf import AtScaleConfEventIE from .atvat import ATVAtIE diff --git a/yt_dlp/extractor/asobichannel.py b/yt_dlp/extractor/asobichannel.py new file mode 100644 index 0000000000..e3479ede99 --- /dev/null +++ b/yt_dlp/extractor/asobichannel.py @@ -0,0 +1,168 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + merge_dicts, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class AsobiChannelBaseIE(InfoExtractor): + _MICROCMS_HEADER = {'X-MICROCMS-API-KEY': 'qRaKehul9AHU8KtL0dnq1OCLKnFec6yrbcz3'} + + def _extract_info(self, metadata): + return traverse_obj(metadata, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('body', {clean_html}), + 'thumbnail': ('contents', 'video_thumb', 'url', {url_or_none}), + 'timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'id', {str}), + }) + + +class AsobiChannelIE(AsobiChannelBaseIE): + IE_NAME = 'asobichannel' + IE_DESC = 'ASOBI CHANNEL' + + _VALID_URL = r'https?://asobichannel\.asobistore\.jp/watch/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://asobichannel.asobistore.jp/watch/1ypp48qd32p', + 'md5': '39df74e872afe032c4eb27b89144fc92', + 'info_dict': { + 'id': '1ypp48qd32p', + 'ext': 'mp4', + 'title': 'アイドルマスター ミリオンライブ! 765プロch 原っぱ通信 #1', + 'description': 'md5:b930bd2199c9b2fd75951ce4aaa7efd2', + 'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/a8e6f84119f54eb9ab4ce16729239905/%E3%82%B5%E3%83%A0%E3%83%8D%20(1).png', + 'timestamp': 1697098247, + 'upload_date': '20231012', + 'modified_timestamp': 1698381162, + 'modified_date': '20231027', + 'channel': 'アイドルマスター', + 'channel_id': 'idolmaster', + }, + }, { + 'url': 'https://asobichannel.asobistore.jp/watch/redigiwnjzqj', + 'md5': '229fa8fb5c591c75ce8c37a497f113f6', + 'info_dict': { + 'id': 'redigiwnjzqj', + 'ext': 'mp4', + 'title': '【おまけ放送】アイドルマスター ミリオンライブ! 765プロch 原っぱ通信 #1', + 'description': 'md5:7d9cd35fb54425a6967822bd564ea2d9', + 'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/20e5c1d6184242eebc2512a5dec59bf0/P1_%E5%8E%9F%E3%81%A3%E3%81%B1%E3%82%B5%E3%83%A0%E3%83%8D.png', + 'modified_timestamp': 1697797125, + 'modified_date': '20231020', + 'timestamp': 1697261769, + 'upload_date': '20231014', + 'channel': 'アイドルマスター', + 'channel_id': 'idolmaster', + }, + }] + + _survapi_header = None + + def _real_initialize(self): + token = self._download_json( + 'https://asobichannel-api.asobistore.jp/api/v1/vspf/token', None, + note='Retrieving API token') + self._survapi_header = {'Authorization': f'Bearer {token}'} + + def _process_vod(self, video_id, metadata): + content_id = metadata['contents']['video_id'] + + vod_data = self._download_json( + f'https://survapi.channel.or.jp/proxy/v1/contents/{content_id}/get_by_cuid', video_id, + headers=self._survapi_header, note='Downloading vod data') + + return { + 'formats': self._extract_m3u8_formats(vod_data['ex_content']['streaming_url'], video_id), + } + + def _process_live(self, video_id, metadata): + content_id = metadata['contents']['video_id'] + event_data = self._download_json( + f'https://survapi.channel.or.jp/ex/events/{content_id}?embed=channel', video_id, + headers=self._survapi_header, note='Downloading event data') + + player_type = traverse_obj(event_data, ('data', 'Player_type', {str})) + if player_type == 'poster': + self.raise_no_formats('Live event has not yet started', expected=True) + live_status = 'is_upcoming' + formats = [] + elif player_type == 'player': + live_status = 'is_live' + formats = self._extract_m3u8_formats( + event_data['data']['Channel']['Custom_live_url'], video_id, live=True) + else: + raise ExtractorError('Unsupported player type {player_type!r}') + + return { + 'release_timestamp': traverse_obj(metadata, ('period', 'start', {parse_iso8601})), + 'live_status': live_status, + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._download_json( + f'https://channel.microcms.io/api/v1/media/{video_id}', video_id, + headers=self._MICROCMS_HEADER) + + info = self._extract_info(metadata) + + video_type = traverse_obj(metadata, ('contents', 'video_type', 0, {str})) + if video_type == 'VOD': + return merge_dicts(info, self._process_vod(video_id, metadata)) + if video_type == 'LIVE': + return merge_dicts(info, self._process_live(video_id, metadata)) + + raise ExtractorError(f'Unexpected video type {video_type!r}') + + +class AsobiChannelTagURLIE(AsobiChannelBaseIE): + IE_NAME = 'asobichannel:tag' + IE_DESC = 'ASOBI CHANNEL' + + _VALID_URL = r'https?://asobichannel\.asobistore\.jp/tag/(?P[a-z0-9-_]+)' + _TESTS = [{ + 'url': 'https://asobichannel.asobistore.jp/tag/bjhh-nbcja', + 'info_dict': { + 'id': 'bjhh-nbcja', + 'title': 'アイドルマスター ミリオンライブ! 765プロch 原っぱ通信', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://asobichannel.asobistore.jp/tag/hvm5qw3c6od', + 'info_dict': { + 'id': 'hvm5qw3c6od', + 'title': 'アイマスMOIW2023ラジオ', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + tag_id = self._match_id(url) + webpage = self._download_webpage(url, tag_id) + title = traverse_obj(self._search_nextjs_data( + webpage, tag_id, fatal=False), ('props', 'pageProps', 'data', 'name', {str})) + + media = self._download_json( + f'https://channel.microcms.io/api/v1/media?limit=999&filters=(tag[contains]{tag_id})', + tag_id, headers=self._MICROCMS_HEADER) + + def entries(): + for metadata in traverse_obj(media, ('contents', lambda _, v: v['id'])): + yield { + '_type': 'url', + 'url': f'https://asobichannel.asobistore.jp/watch/{metadata["id"]}', + 'ie_key': AsobiChannelIE.ie_key(), + **self._extract_info(metadata), + } + + return self.playlist_result(entries(), tag_id, title) From 1a36dbad712d359ec1c5b73d9bbbe562c03e9660 Mon Sep 17 00:00:00 2001 From: SirElderling <148036781+SirElderling@users.noreply.github.com> Date: Fri, 19 Jan 2024 16:29:48 +0000 Subject: [PATCH 22/61] [ie/RinseFMArtistPlaylist] Add extractor (#8794) Authored by: SirElderling --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/rinsefm.py | 78 ++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index eca45019e6..3c94be8b48 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1604,7 +1604,10 @@ from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rheinmaintv import RheinMainTVIE -from .rinsefm import RinseFMIE +from .rinsefm import ( + RinseFMIE, + RinseFMArtistPlaylistIE, +) from .rmcdecouverte import RMCDecouverteIE from .rockstargames import RockstarGamesIE from .rokfin import ( diff --git a/yt_dlp/extractor/rinsefm.py b/yt_dlp/extractor/rinsefm.py index 760adf0eba..f87b895df8 100644 --- a/yt_dlp/extractor/rinsefm.py +++ b/yt_dlp/extractor/rinsefm.py @@ -1,8 +1,34 @@ from .common import InfoExtractor -from ..utils import format_field, parse_iso8601 +from ..utils import ( + MEDIA_EXTENSIONS, + determine_ext, + parse_iso8601, + traverse_obj, + url_or_none, +) -class RinseFMIE(InfoExtractor): +class RinseFMBaseIE(InfoExtractor): + @staticmethod + def _parse_entry(entry): + return { + **traverse_obj(entry, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'url': ('fileUrl', {url_or_none}), + 'release_timestamp': ('episodeDate', {parse_iso8601}), + 'thumbnail': ('featuredImage', 0, 'filename', {str}, + {lambda x: x and f'https://rinse.imgix.net/media/{x}'}), + 'webpage_url': ('slug', {str}, + {lambda x: x and f'https://rinse.fm/episodes/{x}'}), + }), + 'vcodec': 'none', + 'extractor_key': RinseFMIE.ie_key(), + 'extractor': RinseFMIE.IE_NAME, + } + + +class RinseFMIE(RinseFMBaseIE): _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/', @@ -22,12 +48,42 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry'] - return { - 'id': entry['id'], - 'title': entry.get('title'), - 'url': entry['fileUrl'], - 'vcodec': 'none', - 'release_timestamp': parse_iso8601(entry.get('episodeDate')), - 'thumbnail': format_field( - entry, [('featuredImage', 0, 'filename')], 'https://rinse.imgix.net/media/%s', default=None), - } + return self._parse_entry(entry) + + +class RinseFMArtistPlaylistIE(RinseFMBaseIE): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/shows/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/shows/resources/', + 'info_dict': { + 'id': 'resources', + 'title': '[re]sources', + 'description': '[re]sources est un label parisien piloté par le DJ et producteur Tommy Kid.' + }, + 'playlist_mincount': 40 + }, { + 'url': 'https://rinse.fm/shows/ivy/', + 'info_dict': { + 'id': 'ivy', + 'title': '[IVY]', + 'description': 'A dedicated space for DNB/Turbo House and 4x4.' + }, + 'playlist_mincount': 7 + }] + + def _entries(self, data): + for episode in traverse_obj(data, ( + 'props', 'pageProps', 'episodes', lambda _, v: determine_ext(v['fileUrl']) in MEDIA_EXTENSIONS.audio) + ): + yield self._parse_entry(episode) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._og_search_title(webpage) or self._html_search_meta('title', webpage) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage) + data = self._search_nextjs_data(webpage, playlist_id) + + return self.playlist_result( + self._entries(data), playlist_id, title, description=description) From 5eb1458be4767385a9bf1d570ff08e46100cbaa2 Mon Sep 17 00:00:00 2001 From: Christopher Schreiner Date: Fri, 19 Jan 2024 17:38:21 +0100 Subject: [PATCH 23/61] [ie/adn] Add support for German site (#8708) - Add extractor for seasons Closes #6643, Closes #8945 Authored by: infanf --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/adn.py | 114 ++++++++++++++++++++++++++------ 2 files changed, 93 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3c94be8b48..b72b53fdd4 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -47,7 +47,7 @@ ACastChannelIE, ) from .acfun import AcFunVideoIE, AcFunBangumiIE -from .adn import ADNIE +from .adn import ADNIE, ADNSeasonIE from .adobeconnect import AdobeConnectIE from .adobetv import ( AdobeTVEmbedIE, diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index b59dbc8500..ed23226a35 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -19,15 +19,35 @@ long_to_bytes, pkcs1pad, strip_or_none, + str_or_none, try_get, unified_strdate, urlencode_postdata, ) +from ..utils.traversal import traverse_obj -class ADNIE(InfoExtractor): +class ADNBaseIE(InfoExtractor): IE_DESC = 'Animation Digital Network' - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P\d+)' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = f'https://gw.api.{_BASE}/' + _PLAYER_BASE_URL = f'{_API_BASE_URL}player/' + _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' + _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) + _POS_ALIGN_MAP = { + 'start': 1, + 'end': 3, + } + _LINE_ALIGN_MAP = { + 'middle': 8, + 'end': 4, + } + + +class ADNIE(ADNBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?Pfr|de)/video/[^/?#]+/(?P\d+)' _TESTS = [{ 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', 'md5': '1c9ef066ceb302c86f80c2b371615261', @@ -44,29 +64,35 @@ class ADNIE(InfoExtractor): 'season_number': 1, 'episode': 'À ce soir !', 'episode_number': 1, + 'thumbnail': str, + 'season': 'Season 1', }, - 'skip': 'Only available in region (FR, ...)', + 'skip': 'Only available in French and German speaking Europe', }, { 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', 'only_matching': True, + }, { + 'url': 'https://animationdigitalnetwork.de/video/the-eminence-in-shadow/23550-folge-1', + 'md5': '5c5651bf5791fa6fcd7906012b9d94e8', + 'info_dict': { + 'id': '23550', + 'ext': 'mp4', + 'episode_number': 1, + 'duration': 1417, + 'release_date': '20231004', + 'series': 'The Eminence in Shadow', + 'season_number': 2, + 'episode': str, + 'title': str, + 'thumbnail': str, + 'season': 'Season 2', + 'comment_count': int, + 'average_rating': float, + 'description': str, + }, + # 'skip': 'Only available in French and German speaking Europe', }] - _NETRC_MACHINE = 'animationdigitalnetwork' - _BASE = 'animationdigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.' + _BASE + '/' - _PLAYER_BASE_URL = _API_BASE_URL + 'player/' - _HEADERS = {} - _LOGIN_ERR_MESSAGE = 'Unable to log in' - _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) - _POS_ALIGN_MAP = { - 'start': 1, - 'end': 3, - } - _LINE_ALIGN_MAP = { - 'middle': 8, - 'end': 4, - } - def _get_subtitles(self, sub_url, video_id): if not sub_url: return None @@ -116,6 +142,8 @@ def _get_subtitles(self, sub_url, video_id): if sub_lang == 'vostf': sub_lang = 'fr' + elif sub_lang == 'vostde': + sub_lang = 'de' subtitles.setdefault(sub_lang, []).extend([{ 'ext': 'json', 'data': json.dumps(sub), @@ -147,7 +175,7 @@ def _perform_login(self, username, password): self.report_warning(message or self._LOGIN_ERR_MESSAGE) def _real_extract(self, url): - video_id = self._match_id(url) + lang, video_id = self._match_valid_url(url).group('lang', 'id') video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, @@ -162,7 +190,7 @@ def _real_extract(self, url): token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), video_id, 'Downloading access token', headers={ - 'x-player-refresh-token': user['refreshToken'] + 'X-Player-Refresh-Token': user['refreshToken'], }, data=b'')['token'] links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') @@ -184,7 +212,9 @@ def _real_extract(self, url): try: links_data = self._download_json( links_url, video_id, 'Downloading links JSON metadata', headers={ - 'X-Player-Token': authorization + 'X-Player-Token': authorization, + 'X-Target-Distribution': lang, + **self._HEADERS }, query={ 'freeWithAds': 'true', 'adaptive': 'false', @@ -232,6 +262,9 @@ def _real_extract(self, url): if format_id == 'vf': for f in m3u8_formats: f['language'] = 'fr' + elif format_id == 'vde': + for f in m3u8_formats: + f['language'] = 'de' formats.extend(m3u8_formats) video = (self._download_json( @@ -255,3 +288,40 @@ def _real_extract(self, url): 'average_rating': float_or_none(video.get('rating') or metas.get('rating')), 'comment_count': int_or_none(video.get('commentsCount')), } + + +class ADNSeasonIE(ADNBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.(?Pfr|de)/video/(?P[^/?#]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/tokyo-mew-mew-new', + 'playlist_count': 12, + 'info_dict': { + 'id': '911', + 'title': 'Tokyo Mew Mew New', + }, + # 'skip': 'Only available in French end German speaking Europe', + }] + + def _real_extract(self, url): + lang, video_show_slug = self._match_valid_url(url).group('lang', 'id') + show = self._download_json( + f'{self._API_BASE_URL}show/{video_show_slug}/', video_show_slug, + 'Downloading show JSON metadata', headers=self._HEADERS)['show'] + show_id = str(show['id']) + episodes = self._download_json( + f'{self._API_BASE_URL}video/show/{show_id}', video_show_slug, + 'Downloading episode list', headers={ + 'X-Target-Distribution': lang, + **self._HEADERS + }, query={ + 'order': 'asc', + 'limit': '-1', + }) + + def entries(): + for episode_id in traverse_obj(episodes, ('videos', ..., 'id', {str_or_none})): + yield self.url_result( + f'https://animationdigitalnetwork.{lang}/video/{video_show_slug}/{episode_id}', + ADNIE, episode_id) + + return self.playlist_result(entries(), show_id, show.get('title')) From 4a07a455bbf7acf87550053bbba949c828e350ba Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Fri, 19 Jan 2024 08:49:15 -0800 Subject: [PATCH 24/61] [ie/GoPro] Fix extractor (#9019) Authored by: stilor --- yt_dlp/extractor/gopro.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/gopro.py b/yt_dlp/extractor/gopro.py index ae965374cf..ec1595bc50 100644 --- a/yt_dlp/extractor/gopro.py +++ b/yt_dlp/extractor/gopro.py @@ -57,8 +57,8 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - metadata = self._parse_json( - self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id) + metadata = self._search_json( + r'window\.__reflectData\s*=', webpage, 'metadata', video_id) video_info = metadata['collectionMedia'][0] media_data = self._download_json( @@ -99,7 +99,7 @@ def _real_extract(self, url): 'duration': int_or_none( video_info.get('source_duration')), 'artist': str_or_none( - video_info.get('music_track_artist')), + video_info.get('music_track_artist')) or None, 'track': str_or_none( - video_info.get('music_track_name')), + video_info.get('music_track_name')) or None, } From 1713c882730a928ac344c099874d2093fc2c8b51 Mon Sep 17 00:00:00 2001 From: SirElderling <148036781+SirElderling@users.noreply.github.com> Date: Fri, 19 Jan 2024 20:11:00 +0000 Subject: [PATCH 25/61] [ie/bilibili] Add referer header and fix metadata extraction (#8832) Closes #6640 Authored by: SirElderling --- yt_dlp/extractor/bilibili.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 5475b3650b..cd7df69ef0 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1622,6 +1622,7 @@ def _real_extract(self, url): class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' + _HEADERS = {'Referer': 'https://www.bilibili.com/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) @@ -1732,7 +1733,9 @@ def _get_formats(self, *, ep_id=None, aid=None): def _parse_video_metadata(self, video_data): return { 'title': video_data.get('title_display') or video_data.get('title'), + 'description': video_data.get('desc'), 'thumbnail': video_data.get('cover'), + 'timestamp': unified_timestamp(video_data.get('formatted_pub_date')), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), } @@ -1829,17 +1832,6 @@ class BiliIntlIE(BiliIntlBaseIE): 'episode_number': 140, }, 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' - }, { - 'url': 'https://www.bilibili.tv/en/video/2041863208', - 'info_dict': { - 'id': '2041863208', - 'ext': 'mp4', - 'timestamp': 1670874843, - 'description': 'Scheduled for April 2023.\nStudio: ufotable', - 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', - 'upload_date': '20221212', - 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', - }, }, { # episode comment extraction 'url': 'https://www.bilibili.tv/en/play/34580/340317', @@ -1880,9 +1872,9 @@ class BiliIntlIE(BiliIntlBaseIE): 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', 'timestamp': 1667891924, 'upload_date': '20221108', - 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan', 'comment_count': int, - 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + 'thumbnail': r're:https://pic\.bstarstatic\.(?:com|net)/ugc/f6c363659efd2eabe5683fbb906b1582\.jpg', }, 'params': { 'getcomments': True @@ -1945,10 +1937,12 @@ def _extract_video_metadata(self, url, video_id, season_id): # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { - 'title': self._html_search_meta('og:title', webpage), - 'description': self._html_search_meta('og:description', webpage) - }) + self._parse_video_metadata(video_data), { + 'title': get_element_by_class( + 'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage), + 'description': get_element_by_class( + 'bstar-meta__desc', webpage) or self._html_search_meta('og:description'), + }, self._search_json_ld(webpage, video_id, default={})) def _get_comments_reply(self, root_id, next_id=0, display_id=None): comment_api_raw_data = self._download_json( @@ -2036,7 +2030,8 @@ def _real_extract(self, url): 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), 'chapters': chapters, - '__post_extractor': self.extract_comments(video_id, ep_id) + '__post_extractor': self.extract_comments(video_id, ep_id), + 'http_headers': self._HEADERS, } From 4310b6650eeb5630295f4591b37720877878c57a Mon Sep 17 00:00:00 2001 From: divStar Date: Fri, 19 Jan 2024 21:27:16 +0100 Subject: [PATCH 26/61] [ie/getcourseru] Add extractors (#8873) Authored by: divStar, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/getcourseru.py | 179 ++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 yt_dlp/extractor/getcourseru.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b72b53fdd4..3d5c3eb60f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -687,6 +687,10 @@ GeniusIE, GeniusLyricsIE, ) +from .getcourseru import ( + GetCourseRuPlayerIE, + GetCourseRuIE +) from .gettr import ( GettrIE, GettrStreamingIE, diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py new file mode 100644 index 0000000000..6fdbcd7366 --- /dev/null +++ b/yt_dlp/extractor/getcourseru.py @@ -0,0 +1,179 @@ +import re +import time +import urllib.parse + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata +from ..utils.traversal import traverse_obj + + +class GetCourseRuPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] + _TESTS = [{ + 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', + 'info_dict': { + 'id': '513573381', + 'title': '190bdf93f1b29735309853a7a19e24b3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + 'skip': 'JWT expired', + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None, 'Downloading player page') + window_configs = self._search_json( + r'window\.configs\s*=', webpage, 'config', None) + video_id = str(window_configs['gcFileId']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + window_configs['masterPlaylistUrl'], video_id) + + return { + **traverse_obj(window_configs, { + 'title': ('videoHash', {str}), + 'thumbnail': ('previewUrl', {url_or_none}), + 'duration': ('videoDuration', {int_or_none}), + }), + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles + } + + +class GetCourseRuIE(InfoExtractor): + _NETRC_MACHINE = 'getcourseru' + _DOMAINS = [ + 'academymel.online', + 'marafon.mani-beauty.com', + 'on.psbook.ru' + ] + _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' + _VALID_URL = [ + rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', + rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + ] + _TESTS = [{ + 'url': 'http://academymel.online/3video_1', + 'info_dict': { + 'id': '3059742', + 'display_id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '513573381', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }, { + 'url': 'https://academymel.getcourse.ru/3video_1', + 'info_dict': { + 'id': '3059742', + 'display_id': '3video_1', + 'title': 'Промоуроки Академии МЕЛ', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '513573381', + 'ext': 'mp4', + 'title': 'Промоуроки Академии МЕЛ', + 'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80', + 'duration': 1693 + }, + }] + }, { + 'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0', + 'info_dict': { + 'id': '319141781', + 'title': '1. Разминка у стены', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4919601', + 'ext': 'mp4', + 'title': '1. Разминка у стены', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81', + 'duration': 704 + }, + }], + 'skip': 'paid lesson' + }, { + 'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894', + 'info_dict': { + 'id': '272499894', + 'title': 'Мотивация к тренировкам', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '447479687', + 'ext': 'mp4', + 'title': 'Мотивация к тренировкам', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71', + 'duration': 30 + }, + }], + 'skip': 'paid lesson' + }, { + 'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT', + 'only_matching': True, + }] + + _LOGIN_URL_PATH = '/cms/system/login' + + def _login(self, hostname, username, password): + if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): + return + login_url = f'https://{hostname}{self._LOGIN_URL_PATH}' + webpage = self._download_webpage(login_url, None) + + self._request_webpage( + login_url, None, 'Logging in', 'Failed to log in', + data=urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': self._html_search_regex( + r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', + webpage, 'xdgetId'), + 'params[action]': 'login', + 'params[url]': login_url, + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time.time()), + 'requestSimpleSign': self._html_search_regex( + r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign'), + })) + + def _real_extract(self, url): + hostname = urllib.parse.urlparse(url).hostname + username, password = self._get_login_info(netrc_machine=hostname) + if username: + self._login(hostname, username, password) + + display_id = self._match_id(url) + # NB: 404 is returned due to yt-dlp not properly following redirects #9020 + webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404) + if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404: + raise ExtractorError( + f'This video is only available for registered users. {self._login_hint("any", netrc=hostname)}', + expected=True) + + playlist_id = self._search_regex( + r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) + title = self._og_search_title(webpage) or self._html_extract_title(webpage) + + return self.playlist_from_matches( + re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), + playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={ + 'url_transparent': True, + 'title': title, + }) From 50e06e21a68e336198198bda332b8e7d2314f201 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Sat, 20 Jan 2024 05:31:06 +0900 Subject: [PATCH 27/61] [ie/MLBArticle] Fix extractor (#9021) Closes #8682 Authored by: HobbyistDev --- yt_dlp/extractor/mlb.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 72057dc97a..d715b97892 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -355,11 +355,11 @@ class MLBArticleIE(InfoExtractor): 'info_dict': { 'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a', 'title': 'Machado\'s grab draws hilarious irate reaction', - 'modified_timestamp': 1650130737, + 'modified_timestamp': 1675888370, 'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676', - 'modified_date': '20220416', + 'modified_date': '20230208', }, - 'playlist_count': 2, + 'playlist_mincount': 2, }] def _real_extract(self, url): @@ -367,15 +367,13 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) apollo_cache_json = self._search_json(r'window\.initState\s*=', webpage, 'window.initState', display_id)['apolloCache'] - content_data_id = traverse_obj( - apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getForgeContent'), 'id'), get_all=False) - - content_real_info = apollo_cache_json[content_data_id] + content_real_info = traverse_obj( + apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getArticle')), get_all=False) return self.playlist_from_matches( - traverse_obj(content_real_info, ('parts', lambda _, v: v['typename'] == 'Video', 'id')), - getter=lambda x: f'https://www.mlb.com/video/{apollo_cache_json[x]["slug"]}', - ie=MLBVideoIE, playlist_id=content_real_info.get('_translationId'), + traverse_obj(content_real_info, ('parts', lambda _, v: v['__typename'] == 'Video' or v['type'] == 'video')), + getter=lambda x: f'https://www.mlb.com/video/{x["slug"]}', + ie=MLBVideoIE, playlist_id=content_real_info.get('translationId'), title=self._html_search_meta('og:title', webpage), description=content_real_info.get('summary'), modified_timestamp=parse_iso8601(content_real_info.get('lastUpdatedDate'))) From 69d31914952dd33082ac7019c6f76b43c45b9d06 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 20 Jan 2024 10:39:49 +1300 Subject: [PATCH 28/61] [test] Skip source address tests if the address cannot be bound to (#8900) Fixes https://github.com/yt-dlp/yt-dlp/issues/8890 Authored by: coletdjnz --- test/helper.py | 7 ++++++- test/test_networking.py | 5 ++++- test/test_socks.py | 4 +++- test/test_websockets.py | 3 +++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/test/helper.py b/test/helper.py index e5ace8fe2c..4aca47025e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -10,7 +10,7 @@ import yt_dlp.extractor from yt_dlp import YoutubeDL from yt_dlp.compat import compat_os_name -from yt_dlp.utils import preferredencoding, try_call, write_string +from yt_dlp.utils import preferredencoding, try_call, write_string, find_available_port if 'pytest' in sys.modules: import pytest @@ -329,3 +329,8 @@ def http_server_port(httpd): else: sock = httpd.socket return sock.getsockname()[1] + + +def verify_address_availability(address): + if find_available_port(address) is None: + pytest.skip(f'Unable to bind to source address {address} (address may not exist)') diff --git a/test/test_networking.py b/test/test_networking.py index dc60ca6994..62325aa8e0 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -26,7 +26,7 @@ from email.message import Message from http.cookiejar import CookieJar -from test.helper import FakeYDL, http_server_port +from test.helper import FakeYDL, http_server_port, verify_address_availability from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( @@ -538,6 +538,9 @@ def test_timeout(self, handler): @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' + # on some systems these loopback addresses we need for testing may not be available + # see: https://github.com/yt-dlp/yt-dlp/issues/8890 + verify_address_availability(source_address) with handler(source_address=source_address) as rh: data = validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() diff --git a/test/test_socks.py b/test/test_socks.py index 71f783e132..cb22b61dc8 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -25,7 +25,7 @@ ThreadingTCPServer, ) -from test.helper import http_server_port +from test.helper import http_server_port, verify_address_availability from yt_dlp.networking import Request from yt_dlp.networking.exceptions import ProxyError, TransportError from yt_dlp.socks import ( @@ -326,6 +326,7 @@ def test_socks4a_domain_target(self, handler, ctx): def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' + verify_address_availability(source_address) with handler(proxies={'all': f'socks4://{server_address}'}, source_address=source_address) as rh: response = ctx.socks_info_request(rh) @@ -441,6 +442,7 @@ def test_ipv6_socks5_proxy(self, handler, ctx): def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' + verify_address_availability(source_address) with handler(proxies={'all': f'socks5://{server_address}'}, source_address=source_address) as rh: response = ctx.socks_info_request(rh) assert response['client_address'][0] == source_address diff --git a/test/test_websockets.py b/test/test_websockets.py index af6142ea3b..91bac3442e 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -6,6 +6,8 @@ import pytest +from test.helper import verify_address_availability + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import http.client @@ -227,6 +229,7 @@ def test_cookies(self, handler): @pytest.mark.parametrize('handler', ['Websockets'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' + verify_address_availability(source_address) with handler(source_address=source_address) as rh: ws = validate_and_send(rh, Request(self.ws_base_url)) ws.send('source_address') From 811d298b231cfa29e75c321b23a91d1c2b17602c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 20 Jan 2024 15:26:50 +1300 Subject: [PATCH 29/61] [networking] Remove `_CompatHTTPError` (#8871) Use `yt_dlp.networking.exceptions.HTTPError`. `_CompatHTTPError` was to help with transition to the networking framework. Authored by: coletdjnz --- test/test_networking_utils.py | 82 ++-------------------- yt_dlp/YoutubeDL.py | 3 - yt_dlp/compat/_legacy.py | 4 +- yt_dlp/networking/exceptions.py | 116 +------------------------------- 4 files changed, 7 insertions(+), 198 deletions(-) diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index 419aae1e47..b7b71430e7 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -8,13 +8,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import contextlib import io -import platform import random import ssl -import urllib.error -import warnings from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.dependencies import certifi @@ -30,7 +26,6 @@ from yt_dlp.networking.exceptions import ( HTTPError, IncompleteRead, - _CompatHTTPError, ) from yt_dlp.socks import ProxyType from yt_dlp.utils.networking import HTTPHeaderDict @@ -179,11 +174,10 @@ class TestNetworkingExceptions: def create_response(status): return Response(fp=io.BytesIO(b'test'), url='http://example.com', headers={'tesT': 'test'}, status=status) - @pytest.mark.parametrize('http_error_class', [HTTPError, lambda r: _CompatHTTPError(HTTPError(r))]) - def test_http_error(self, http_error_class): + def test_http_error(self): response = self.create_response(403) - error = http_error_class(response) + error = HTTPError(response) assert error.status == 403 assert str(error) == error.msg == 'HTTP Error 403: Forbidden' @@ -194,80 +188,12 @@ def test_http_error(self, http_error_class): assert data == b'test' assert repr(error) == '' - @pytest.mark.parametrize('http_error_class', [HTTPError, lambda *args, **kwargs: _CompatHTTPError(HTTPError(*args, **kwargs))]) - def test_redirect_http_error(self, http_error_class): + def test_redirect_http_error(self): response = self.create_response(301) - error = http_error_class(response, redirect_loop=True) + error = HTTPError(response, redirect_loop=True) assert str(error) == error.msg == 'HTTP Error 301: Moved Permanently (redirect loop detected)' assert error.reason == 'Moved Permanently' - def test_compat_http_error(self): - response = self.create_response(403) - error = _CompatHTTPError(HTTPError(response)) - assert isinstance(error, HTTPError) - assert isinstance(error, urllib.error.HTTPError) - - @contextlib.contextmanager - def raises_deprecation_warning(): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - yield - - if len(w) == 0: - pytest.fail('Did not raise DeprecationWarning') - if len(w) > 1: - pytest.fail(f'Raised multiple warnings: {w}') - - if not issubclass(w[-1].category, DeprecationWarning): - pytest.fail(f'Expected DeprecationWarning, got {w[-1].category}') - w.clear() - - with raises_deprecation_warning(): - assert error.code == 403 - - with raises_deprecation_warning(): - assert error.getcode() == 403 - - with raises_deprecation_warning(): - assert error.hdrs is error.response.headers - - with raises_deprecation_warning(): - assert error.info() is error.response.headers - - with raises_deprecation_warning(): - assert error.headers is error.response.headers - - with raises_deprecation_warning(): - assert error.filename == error.response.url - - with raises_deprecation_warning(): - assert error.url == error.response.url - - with raises_deprecation_warning(): - assert error.geturl() == error.response.url - - # Passthrough file operations - with raises_deprecation_warning(): - assert error.read() == b'test' - - with raises_deprecation_warning(): - assert not error.closed - - with raises_deprecation_warning(): - # Technically Response operations are also passed through, which should not be used. - assert error.get_header('test') == 'test' - - # Should not raise a warning - error.close() - - @pytest.mark.skipif( - platform.python_implementation() == 'PyPy', reason='garbage collector works differently in pypy') - def test_compat_http_error_autoclose(self): - # Compat HTTPError should not autoclose response - response = self.create_response(403) - _CompatHTTPError(HTTPError(response)) - assert not response.closed - def test_incomplete_read_error(self): error = IncompleteRead(4, 3, cause='test') assert isinstance(error, IncompleteRead) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8d96498a67..5dcefb5b81 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -40,7 +40,6 @@ NoSupportingHandlers, RequestError, SSLError, - _CompatHTTPError, network_exceptions, ) from .plugins import directories as plugin_directories @@ -4110,8 +4109,6 @@ def urlopen(self, req): 'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. ' 'Try using --legacy-server-connect', cause=e) from e raise - except HTTPError as e: # TODO: Remove in a future release - raise _CompatHTTPError(e) from e def build_request_director(self, handlers, preferences=None): logger = _YDLLogger(self) diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index 90ccf0f14a..7ea5d08120 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -35,6 +35,7 @@ from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import websockets as compat_websockets # noqa: F401 from ..dependencies.Cryptodome import AES as compat_pycrypto_AES # noqa: F401 +from ..networking.exceptions import HTTPError as compat_HTTPError # noqa: F401 passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) @@ -70,7 +71,6 @@ def compat_setenv(key, value, env=os.environ): compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser compat_http_client = http.client compat_http_server = http.server -compat_HTTPError = urllib.error.HTTPError compat_input = input compat_integer_types = (int, ) compat_itertools_count = itertools.count @@ -88,7 +88,7 @@ def compat_setenv(key, value, env=os.environ): compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL compat_tokenize_tokenize = tokenize.tokenize compat_urllib_error = urllib.error -compat_urllib_HTTPError = urllib.error.HTTPError +compat_urllib_HTTPError = compat_HTTPError compat_urllib_parse = urllib.parse compat_urllib_parse_parse_qs = urllib.parse.parse_qs compat_urllib_parse_quote = urllib.parse.quote diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 12441901c9..9037f18e2a 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -1,9 +1,8 @@ from __future__ import annotations import typing -import urllib.error -from ..utils import YoutubeDLError, deprecation_warning +from ..utils import YoutubeDLError if typing.TYPE_CHECKING: from .common import RequestHandler, Response @@ -101,117 +100,4 @@ class ProxyError(TransportError): pass -class _CompatHTTPError(urllib.error.HTTPError, HTTPError): - """ - Provides backwards compatibility with urllib.error.HTTPError. - Do not use this class directly, use HTTPError instead. - """ - - def __init__(self, http_error: HTTPError): - super().__init__( - url=http_error.response.url, - code=http_error.status, - msg=http_error.msg, - hdrs=http_error.response.headers, - fp=http_error.response - ) - self._closer.close_called = True # Disable auto close - self._http_error = http_error - HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop) - - @property - def status(self): - return self._http_error.status - - @status.setter - def status(self, value): - return - - @property - def reason(self): - return self._http_error.reason - - @reason.setter - def reason(self, value): - return - - @property - def headers(self): - deprecation_warning('HTTPError.headers is deprecated, use HTTPError.response.headers instead') - return self._http_error.response.headers - - @headers.setter - def headers(self, value): - return - - def info(self): - deprecation_warning('HTTPError.info() is deprecated, use HTTPError.response.headers instead') - return self.response.headers - - def getcode(self): - deprecation_warning('HTTPError.getcode is deprecated, use HTTPError.status instead') - return self.status - - def geturl(self): - deprecation_warning('HTTPError.geturl is deprecated, use HTTPError.response.url instead') - return self.response.url - - @property - def code(self): - deprecation_warning('HTTPError.code is deprecated, use HTTPError.status instead') - return self.status - - @code.setter - def code(self, value): - return - - @property - def url(self): - deprecation_warning('HTTPError.url is deprecated, use HTTPError.response.url instead') - return self.response.url - - @url.setter - def url(self, value): - return - - @property - def hdrs(self): - deprecation_warning('HTTPError.hdrs is deprecated, use HTTPError.response.headers instead') - return self.response.headers - - @hdrs.setter - def hdrs(self, value): - return - - @property - def filename(self): - deprecation_warning('HTTPError.filename is deprecated, use HTTPError.response.url instead') - return self.response.url - - @filename.setter - def filename(self, value): - return - - def __getattr__(self, name): - # File operations are passed through the response. - # Warn for some commonly used ones - passthrough_warnings = { - 'read': 'response.read()', - # technically possibly due to passthrough, but we should discourage this - 'get_header': 'response.get_header()', - 'readable': 'response.readable()', - 'closed': 'response.closed', - 'tell': 'response.tell()', - } - if name in passthrough_warnings: - deprecation_warning(f'HTTPError.{name} is deprecated, use HTTPError.{passthrough_warnings[name]} instead') - return super().__getattr__(name) - - def __str__(self): - return str(self._http_error) - - def __repr__(self): - return repr(self._http_error) - - network_exceptions = (HTTPError, TransportError) From f24e44e8cbd88ce338d52f594a19330f64d38b50 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Sat, 20 Jan 2024 06:08:55 +0100 Subject: [PATCH 30/61] [webvtt] Don't parse single fragment files (#9034) Partially addresses #5804 Authored by: seproDev --- yt_dlp/downloader/hls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index d4b3f03200..4ac5d99dc0 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -369,7 +369,10 @@ def fin_fragments(): return output.getvalue().encode() - self.download_and_append_fragments( - ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) + if len(fragments) == 1: + self.download_and_append_fragments(ctx, fragments, info_dict) + else: + self.download_and_append_fragments( + ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) else: return self.download_and_append_fragments(ctx, fragments, info_dict) From 35f4f764a786685ea45d84abe1cf1ad3847f4c97 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 21 Jan 2024 10:03:33 +1300 Subject: [PATCH 31/61] [rh:requests] Apply `remove_dot_segments` to absolute redirect locations Fixes https://github.com/yt-dlp/yt-dlp/issues/9020 Authored by: coletdjnz --- test/test_networking.py | 25 ++++++++++++++++--------- yt_dlp/networking/_requests.py | 5 +++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 62325aa8e0..8cadd86f5a 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -180,6 +180,12 @@ def do_GET(self): self.send_header('Location', '/a/b/./../../headers') self.send_header('Content-Length', '0') self.end_headers() + elif self.path == '/redirect_dotsegments_absolute': + self.send_response(301) + # redirect to /headers but with dot segments before - absolute url + self.send_header('Location', f'http://127.0.0.1:{http_server_port(self.server)}/a/b/./../../headers') + self.send_header('Content-Length', '0') + self.end_headers() elif self.path.startswith('/redirect_'): self._redirect() elif self.path.startswith('/method'): @@ -345,16 +351,17 @@ def test_percent_encode(self, handler): res.close() @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) - def test_remove_dot_segments(self, handler): - with handler() as rh: + @pytest.mark.parametrize('path', [ + '/a/b/./../../headers', + '/redirect_dotsegments', + # https://github.com/yt-dlp/yt-dlp/issues/9020 + '/redirect_dotsegments_absolute', + ]) + def test_remove_dot_segments(self, handler, path): + with handler(verbose=True) as rh: # This isn't a comprehensive test, - # but it should be enough to check whether the handler is removing dot segments - res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers')) - assert res.status == 200 - assert res.url == f'http://127.0.0.1:{self.http_port}/headers' - res.close() - - res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments')) + # but it should be enough to check whether the handler is removing dot segments in required scenarios + res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}{path}')) assert res.status == 200 assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index e129110ca4..00e4bdb490 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -8,6 +8,7 @@ from ..dependencies import brotli, requests, urllib3 from ..utils import bug_reports_message, int_or_none, variadic +from ..utils.networking import normalize_url if requests is None: raise ImportError('requests module is not installed') @@ -199,6 +200,10 @@ def rebuild_method(self, prepared_request, response): prepared_request.method = new_method + # Requests fails to resolve dot segments on absolute redirect locations + # See: https://github.com/yt-dlp/yt-dlp/issues/9020 + prepared_request.url = normalize_url(prepared_request.url) + def rebuild_auth(self, prepared_request, response): # HACK: undo status code change from rebuild_method, if applicable. # rebuild_auth runs after requests would remove headers/body based on status code From fcaa2e735b00b15a2b0d9f55f4187c654b4b5b39 Mon Sep 17 00:00:00 2001 From: "lauren n. liberda" Date: Sun, 21 Jan 2024 03:22:26 +0100 Subject: [PATCH 32/61] [ie/Sejm,RedCDNLivx] Add extractors (#8676) Authored by: selfisekai --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/redge.py | 135 ++++++++++++++++++++ yt_dlp/extractor/sejmpl.py | 218 ++++++++++++++++++++++++++++++++ 3 files changed, 355 insertions(+) create mode 100644 yt_dlp/extractor/redge.py create mode 100644 yt_dlp/extractor/sejmpl.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3d5c3eb60f..31bef1eb5d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1593,6 +1593,7 @@ RedBullIE, ) from .reddit import RedditIE +from .redge import RedCDNLivxIE from .redgifs import ( RedGifsIE, RedGifsSearchIE, @@ -1727,6 +1728,7 @@ ) from .scrolller import ScrolllerIE from .seeker import SeekerIE +from .sejmpl import SejmIE from .senalcolombia import SenalColombiaLiveIE from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE diff --git a/yt_dlp/extractor/redge.py b/yt_dlp/extractor/redge.py new file mode 100644 index 0000000000..875d6f8aa5 --- /dev/null +++ b/yt_dlp/extractor/redge.py @@ -0,0 +1,135 @@ +import functools + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + float_or_none, + int_or_none, + join_nonempty, + parse_qs, + update_url_query, +) +from ..utils.traversal import traverse_obj + + +class RedCDNLivxIE(InfoExtractor): + _VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P[^/?#]+)/(?P[^?#]+)\.livx' + IE_NAME = 'redcdnlivx' + + _TESTS = [{ + 'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000', + 'info_dict': { + 'id': 'ENC02-638272860000-638292544000', + 'ext': 'mp4', + 'title': 'ENC02', + 'duration': 19683.982, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000', + 'info_dict': { + 'id': 'ENC18-722333096000-722335562000', + 'ext': 'mp4', + 'title': 'ENC18', + 'duration': 2463.995, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000', + 'info_dict': { + 'id': 'triathlon2018-warsaw-550305000000-550327620000', + 'ext': 'mp4', + 'title': 'triathlon2018/warsaw', + 'duration': 22619.98, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000', + 'only_matching': True, + }, { + 'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000', + 'only_matching': True, + }] + + """ + Known methods (first in url path): + - `livedash` - DASH MPD + - `livehls` - HTTP Live Streaming + - `livess` - IIS Smooth Streaming + - `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac + - `sc` - shoutcast/icecast (audio streams, like radio) + """ + + def _real_extract(self, url): + tenant, path = self._match_valid_url(url).group('tenant', 'id') + qs = parse_qs(url) + start_time = traverse_obj(qs, ('startTime', 0, {int_or_none})) + stop_time = traverse_obj(qs, ('stopTime', 0, {int_or_none})) + + def livx_mode(mode): + suffix = '' + if mode == 'livess': + suffix = '/manifest' + elif mode == 'livehls': + suffix = '/playlist.m3u8' + file_qs = {} + if start_time: + file_qs['startTime'] = start_time + if stop_time: + file_qs['stopTime'] = stop_time + if mode == 'nvr': + file_qs['nolimit'] = 1 + elif mode != 'sc': + file_qs['indexMode'] = 'true' + return update_url_query(f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}', file_qs) + + # no id or title for a transmission. making ones up. + title = path \ + .replace('/live', '').replace('live/', '') \ + .replace('/channel', '').replace('channel/', '') \ + .strip('/') + video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time) + + formats = [] + # downloading the manifest separately here instead of _extract_ism_formats to also get some stream metadata + ism_res = self._download_xml_handle( + livx_mode('livess'), video_id, + note='Downloading ISM manifest', + errnote='Failed to download ISM manifest', + fatal=False) + ism_doc = None + if ism_res is not False: + ism_doc, ism_urlh = ism_res + formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss') + + nvr_urlh = self._request_webpage( + HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False, + expected_status=lambda _: True) + if nvr_urlh and nvr_urlh.status == 200: + formats.append({ + 'url': nvr_urlh.url, + 'ext': 'flv', + 'format_id': 'direct-0', + 'preference': -1, # might be slow + }) + formats.extend(self._extract_mpd_formats(livx_mode('livedash'), video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False)) + + time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000 + duration = traverse_obj( + ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None + + live_status = None + if traverse_obj(ism_doc, '@IsLive') == 'TRUE': + live_status = 'is_live' + elif duration: + live_status = 'was_live' + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration, + 'live_status': live_status, + } diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py new file mode 100644 index 0000000000..29cb0152a2 --- /dev/null +++ b/yt_dlp/extractor/sejmpl.py @@ -0,0 +1,218 @@ +import datetime + +from .common import InfoExtractor +from .redge import RedCDNLivxIE +from ..utils import ( + clean_html, + join_nonempty, + js_to_json, + strip_or_none, + update_url_query, +) +from ..utils.traversal import traverse_obj + + +def is_dst(date): + last_march = datetime.datetime(date.year, 3, 31) + last_october = datetime.datetime(date.year, 10, 31) + last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7) + last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7) + return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3) + + +def rfc3339_to_atende(date): + date = datetime.datetime.fromisoformat(date) + date = date + datetime.timedelta(hours=1 if is_dst(date) else 0) + return int((date.timestamp() - 978307200) * 1000) + + +class SejmIE(InfoExtractor): + _VALID_URL = ( + r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P[\dA-F]+)', + r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P[\dA-F]+)', + r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P\d+)\.nsf/VideoFrame\.xsp/(?P[\dA-F]+)', + ) + IE_NAME = 'sejm' + + _TESTS = [{ + # multiple cameras, polish SL iterpreter + 'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5', + 'info_dict': { + 'id': '6181EF1AD9CEEBB5C1258A6D006452B5', + 'title': '1. posiedzenie Sejmu X kadencji', + 'duration': 20145, + 'live_status': 'was_live', + 'location': 'Sala Posiedzeń', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ENC01-722340000000-722360145000', + 'ext': 'mp4', + 'duration': 20145, + 'title': '1. posiedzenie Sejmu X kadencji - ENC01', + 'live_status': 'was_live', + }, + }, { + 'info_dict': { + 'id': 'ENC30-722340000000-722360145000', + 'ext': 'mp4', + 'duration': 20145, + 'title': '1. posiedzenie Sejmu X kadencji - ENC30', + 'live_status': 'was_live', + }, + }, { + 'info_dict': { + 'id': 'ENC31-722340000000-722360145000', + 'ext': 'mp4', + 'duration': 20145, + 'title': '1. posiedzenie Sejmu X kadencji - ENC31', + 'live_status': 'was_live', + }, + }, { + 'info_dict': { + 'id': 'ENC32-722340000000-722360145000', + 'ext': 'mp4', + 'duration': 20145, + 'title': '1. posiedzenie Sejmu X kadencji - ENC32', + 'live_status': 'was_live', + }, + }, { + # sign lang interpreter + 'info_dict': { + 'id': 'Migacz-ENC01-1-722340000000-722360145000', + 'ext': 'mp4', + 'duration': 20145, + 'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01', + 'live_status': 'was_live', + }, + }], + }, { + 'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2', + 'info_dict': { + 'id': '9377A9D65518E9A5C125808E002E9FF2', + 'title': 'Debata "Lepsza Polska: obywatelska"', + 'description': 'KP .Nowoczesna', + 'duration': 8770, + 'live_status': 'was_live', + 'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ENC08-1-503831270000-503840040000', + 'ext': 'mp4', + 'duration': 8770, + 'title': 'Debata "Lepsza Polska: obywatelska" - ENC08', + 'live_status': 'was_live', + }, + }], + }, { + # 7th term is very special, since it does not use redcdn livx + 'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F', + 'info_dict': { + 'id': 'A6E6D475ECCC6FE5C1257EF90034817F', + 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu', + 'description': 'SLD - Biuro Prasowe Klubu', + 'duration': 514, + 'location': 'sala 101/bud. C', + 'live_status': 'was_live', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'A6E6D475ECCC6FE5C1257EF90034817F', + 'ext': 'mp4', + 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu', + 'duration': 514, + }, + }], + }, { + 'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492', + 'only_matching': True, + }] + + def _real_extract(self, url): + term, video_id = self._match_valid_url(url).group('term', 'id') + frame = self._download_webpage( + f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}', + video_id) + # despite it says "transmisje_arch", it works for live streams too! + data = self._download_json( + f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}', + video_id) + params = data['params'] + + title = strip_or_none(data.get('title')) + + if data.get('status') == 'VIDEO_ENDED': + live_status = 'was_live' + elif data.get('status') == 'VIDEO_PLAYING': + live_status = 'is_live' + else: + live_status = None + self.report_warning(f'unknown status: {data.get("status")}') + + start_time = rfc3339_to_atende(params['start']) + # current streams have a stop time of *expected* end of session, but actual times + # can change during the transmission. setting a stop_time would artificially + # end the stream at that time, while the session actually keeps going. + if live_status == 'was_live': + stop_time = rfc3339_to_atende(params['stop']) + duration = (stop_time - start_time) // 1000 + else: + stop_time, duration = None, None + + entries = [] + + def add_entry(file, legacy_file=False): + if not file: + return + file = self._proto_relative_url(file) + if not legacy_file: + file = update_url_query(file, {'startTime': start_time}) + if stop_time is not None: + file = update_url_query(file, {'stopTime': stop_time}) + stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id') + common_info = { + 'url': file, + 'duration': duration, + } + if legacy_file: + entries.append({ + **common_info, + 'id': video_id, + 'title': title, + }) + else: + entries.append({ + **common_info, + '_type': 'url_transparent', + 'ie_key': RedCDNLivxIE.ie_key(), + 'id': stream_id, + 'title': join_nonempty(title, stream_id, delim=' - '), + }) + + cameras = self._search_json( + r'var\s+cameras\s*=', frame, 'camera list', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json, + fatal=False) or [] + for camera_file in traverse_obj(cameras, (..., 'file', {dict})): + if camera_file.get('flv'): + add_entry(camera_file['flv']) + elif camera_file.get('mp4'): + # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx + add_entry(camera_file['mp4'], legacy_file=True) + else: + self.report_warning('Unknown camera stream type found') + + if params.get('mig'): + add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False)) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': video_id, + 'title': title, + 'description': clean_html(data.get('desc')) or None, + 'duration': duration, + 'live_status': live_status, + 'location': strip_or_none(data.get('location')), + } From 5a63454b3637b3603434026cddfeac509218b90e Mon Sep 17 00:00:00 2001 From: Martin Renold Date: Sun, 21 Jan 2024 03:45:38 +0100 Subject: [PATCH 33/61] [ie/mx3] Add extractors (#8736) Authored by: martinxyz --- yt_dlp/extractor/_extractors.py | 5 + yt_dlp/extractor/mx3.py | 171 ++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 yt_dlp/extractor/mx3.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 31bef1eb5d..c4f1ccb8e4 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1137,6 +1137,11 @@ MusicdexArtistIE, MusicdexPlaylistIE, ) +from .mx3 import ( + Mx3IE, + Mx3NeoIE, + Mx3VolksmusikIE, +) from .mxplayer import ( MxplayerIE, MxplayerShowIE, diff --git a/yt_dlp/extractor/mx3.py b/yt_dlp/extractor/mx3.py new file mode 100644 index 0000000000..cb9f50e0cf --- /dev/null +++ b/yt_dlp/extractor/mx3.py @@ -0,0 +1,171 @@ +import re + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + get_element_by_class, + int_or_none, + try_call, + url_or_none, + urlhandle_detect_ext, +) +from ..utils.traversal import traverse_obj + + +class Mx3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?%s/t/(?P\w+)' + _FORMATS = [{ + 'url': 'player_asset', + 'format_id': 'default', + 'quality': 0, + }, { + 'url': 'player_asset?quality=hd', + 'format_id': 'hd', + 'quality': 1, + }, { + 'url': 'download', + 'format_id': 'download', + 'quality': 2, + }, { + 'url': 'player_asset?quality=source', + 'format_id': 'source', + 'quality': 2, + }] + + def _extract_formats(self, track_id): + formats = [] + for fmt in self._FORMATS: + format_url = f'https://{self._DOMAIN}/tracks/{track_id}/{fmt["url"]}' + urlh = self._request_webpage( + HEADRequest(format_url), track_id, fatal=False, expected_status=404, + note=f'Checking for format {fmt["format_id"]}') + if urlh and urlh.status == 200: + formats.append({ + **fmt, + 'url': format_url, + 'ext': urlhandle_detect_ext(urlh), + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + return formats + + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage(url, track_id) + more_info = get_element_by_class('single-more-info', webpage) + data = self._download_json(f'https://{self._DOMAIN}/t/{track_id}.json', track_id, fatal=False) + + def get_info_field(name): + return self._html_search_regex( + rf']*>\s*{name}\s*\s*]*>(.*?)', + more_info, name, default=None, flags=re.DOTALL) + + return { + 'id': track_id, + 'formats': self._extract_formats(track_id), + 'genre': self._html_search_regex( + r']+class="single-band-genre"[^>]*>([^<]+)', webpage, 'genre', default=None), + 'release_year': int_or_none(get_info_field('Year of creation')), + 'description': get_info_field('Description'), + 'tags': try_call(lambda: get_info_field('Tag').split(', '), list), + **traverse_obj(data, { + 'title': ('title', {str}), + 'artist': (('performer_name', 'artist'), {str}), + 'album_artist': ('artist', {str}), + 'composer': ('composer_name', {str}), + 'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}), + }, get_all=False), + } + + +class Mx3IE(Mx3BaseIE): + _DOMAIN = 'mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://mx3.ch/t/1Cru', + 'md5': '7ba09e9826b4447d4e1ce9d69e0e295f', + 'info_dict': { + 'id': '1Cru', + 'ext': 'wav', + 'artist': 'Godina', + 'album_artist': 'Tortue Tortue', + 'composer': 'Olivier Godinat', + 'genre': 'Rock', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', + 'title': "S'envoler", + 'release_year': 2021, + 'tags': [], + } + }, { + 'url': 'https://mx3.ch/t/1LIY', + 'md5': '48293cb908342547827f963a5a2e9118', + 'info_dict': { + 'id': '1LIY', + 'ext': 'mov', + 'artist': 'Tania Kimfumu', + 'album_artist': 'The Broots', + 'composer': 'Emmanuel Diserens', + 'genre': 'Electro', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', + 'title': 'The Broots-Larytta remix "Begging For Help"', + 'release_year': 2023, + 'tags': ['the broots', 'cassata records', 'larytta'], + 'description': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023', + } + }, { + 'url': 'https://mx3.ch/t/1C6E', + 'md5': '1afcd578493ddb8e5008e94bb6d97e25', + 'info_dict': { + 'id': '1C6E', + 'ext': 'wav', + 'artist': 'Alien Bubblegum', + 'album_artist': 'Alien Bubblegum', + 'composer': 'Alien Bubblegum', + 'genre': 'Punk', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', + 'title': 'Wide Awake', + 'release_year': 2021, + 'tags': ['alien bubblegum', 'bubblegum', 'alien', 'pop punk', 'poppunk'], + } + }] + + +class Mx3NeoIE(Mx3BaseIE): + _DOMAIN = 'neo.mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://neo.mx3.ch/t/1hpd', + 'md5': '6d9986bbae5cac3296ec8813bf965eb2', + 'info_dict': { + 'id': '1hpd', + 'ext': 'wav', + 'artist': 'Baptiste Lopez', + 'album_artist': 'Kammerorchester Basel', + 'composer': 'Jannik Giger', + 'genre': 'Composition, Orchestra', + 'title': 'Troisième œil. Für Kammerorchester (2023)', + 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', + 'release_year': 2023, + 'tags': [], + } + }] + + +class Mx3VolksmusikIE(Mx3BaseIE): + _DOMAIN = 'volksmusik.mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://volksmusik.mx3.ch/t/Zx', + 'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c', + 'info_dict': { + 'id': 'Zx', + 'ext': 'mp3', + 'artist': 'Ländlerkapelle GrischArt', + 'album_artist': 'Ländlerkapelle GrischArt', + 'composer': 'Urs Glauser', + 'genre': 'Instrumental, Graubünden', + 'title': 'Chämilouf', + 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', + 'release_year': 2012, + 'tags': [], + } + }] From 9f1e9dab21bbe651544c8f4663b0e615dc450e4d Mon Sep 17 00:00:00 2001 From: dasidiot <140998618+dasidiot@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:46:53 -0500 Subject: [PATCH 34/61] [ie/motherless] Support uploader playlists (#8994) Authored by: dasidiot --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/motherless.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c4f1ccb8e4..a273ae0d9c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1111,6 +1111,7 @@ MotherlessIE, MotherlessGroupIE, MotherlessGalleryIE, + MotherlessUploaderIE, ) from .motorsport import MotorsportIE from .moviepilot import MoviepilotIE diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index e359c44e93..160150a7b6 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -177,6 +177,7 @@ def _real_extract(self, url): class MotherlessPaginatedIE(InfoExtractor): + _EXTRA_QUERY = {} _PAGE_SIZE = 60 def _correct_path(self, url, item_id): @@ -199,7 +200,7 @@ def _real_extract(self, url): def get_page(idx): page = idx + 1 current_page = webpage if not idx else self._download_webpage( - real_url, item_id, note=f'Downloading page {page}', query={'page': page}) + real_url, item_id, note=f'Downloading page {page}', query={'page': page, **self._EXTRA_QUERY}) yield from self._extract_entries(current_page, real_url) return self.playlist_result( @@ -213,7 +214,7 @@ class MotherlessGroupIE(MotherlessPaginatedIE): 'url': 'http://motherless.com/gv/movie_scenes', 'info_dict': { 'id': 'movie_scenes', - 'title': 'Movie Scenes', + 'title': 'Movie Scenes - Videos - Hot and sexy scenes from "regular" movies... Beautiful actresses fully', }, 'playlist_mincount': 540, }, { @@ -244,7 +245,7 @@ class MotherlessGalleryIE(MotherlessPaginatedIE): 'id': '338999F', 'title': 'Random', }, - 'playlist_mincount': 190, + 'playlist_mincount': 171, }, { 'url': 'https://motherless.com/GVABD6213', 'info_dict': { @@ -270,3 +271,27 @@ class MotherlessGalleryIE(MotherlessPaginatedIE): def _correct_path(self, url, item_id): return urllib.parse.urljoin(url, f'/GV{item_id}') + + +class MotherlessUploaderIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/u/(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://motherless.com/u/Mrgo4hrs2023', + 'info_dict': { + 'id': 'Mrgo4hrs2023', + 'title': "Mrgo4hrs2023's Uploads - Videos", + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://motherless.com/u/Happy_couple?t=v', + 'info_dict': { + 'id': 'Happy_couple', + 'title': "Happy_couple's Uploads - Videos", + }, + 'playlist_mincount': 8, + }] + + _EXTRA_QUERY = {'t': 'v'} + + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/u/{item_id}?t=v') From 3e083191cdc34dd8c482da9a9b4bc682f824cb9d Mon Sep 17 00:00:00 2001 From: u-spec-png Date: Sun, 21 Jan 2024 19:50:14 +0100 Subject: [PATCH 35/61] [ie/Newgrounds:user] Fix extractor (#9046) Closes #7308 Authored by: u-spec-png --- yt_dlp/extractor/newgrounds.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 9e3286dfe7..9601cd10e7 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -3,15 +3,15 @@ from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, clean_html, extract_attributes, get_element_by_id, int_or_none, parse_count, parse_duration, + traverse_obj, unified_timestamp, - OnDemandPagedList, - try_get, ) @@ -263,19 +263,16 @@ class NewgroundsUserIE(InfoExtractor): def _fetch_page(self, channel_id, url, page): page += 1 posts_info = self._download_json( - f'{url}/page/{page}', channel_id, + f'{url}?page={page}', channel_id, note=f'Downloading page {page}', headers={ 'Accept': 'application/json, text/javascript, */*; q = 0.01', 'X-Requested-With': 'XMLHttpRequest', }) - sequence = posts_info.get('sequence', []) - for year in sequence: - posts = try_get(posts_info, lambda x: x['years'][str(year)]['items']) - for post in posts: - path, media_id = self._search_regex( - r']+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', - post, 'url', group=(1, 2)) - yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) + for post in traverse_obj(posts_info, ('items', ..., ..., {str})): + path, media_id = self._search_regex( + r']+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', + post, 'url', group=(1, 2)) + yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) def _real_extract(self, url): channel_id = self._match_id(url) From c0ecceeefe6ebd27452d9d8f20658f83ae121d04 Mon Sep 17 00:00:00 2001 From: gmes78 Date: Sun, 21 Jan 2024 18:56:01 +0000 Subject: [PATCH 36/61] [ie/Rule34Video] Fix `_VALID_URL` (#9044) Authored by: gmes78 --- yt_dlp/extractor/rule34video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index e6bb4258e9..85ad7e2ff2 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -18,10 +18,10 @@ class Rule34VideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos?/(?P\d+)' _TESTS = [ { - 'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/', + 'url': 'https://rule34video.com/video/3065157/shot-it-mmd-hmv/', 'md5': 'ffccac2c23799dabbd192621ae4d04f3', 'info_dict': { 'id': '3065157', From c099ec9392b0283dde34b290d1a04158ad8eb882 Mon Sep 17 00:00:00 2001 From: Stefan Lobbenmeier Date: Sun, 21 Jan 2024 21:54:11 +0100 Subject: [PATCH 37/61] [ie/ard:mediathek] Support cookies to verify age (#9037) Closes #9035 Authored by: StefanLobbenmeier --- yt_dlp/extractor/ard.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index f4b1cd0756..46e68d61e2 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -8,6 +8,7 @@ determine_ext, int_or_none, join_nonempty, + jwt_decode_hs256, make_archive_id, parse_duration, parse_iso8601, @@ -238,6 +239,7 @@ class ARDBetaMediathekIE(InfoExtractor): (?P[a-zA-Z0-9]+) /?(?:[?#]|$)''' _GEO_COUNTRIES = ['DE'] + _TOKEN_URL = 'https://sso.ardmediathek.de/sso/token' _TESTS = [{ 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0', @@ -359,12 +361,27 @@ def _extract_episode_info(self, title): def _real_extract(self, url): display_id = self._match_id(url) + query = {'embedded': 'false', 'mcV6': 'true'} + headers = {} + + if self._get_cookies(self._TOKEN_URL).get('ams'): + token = self._download_json( + self._TOKEN_URL, display_id, 'Fetching token for age verification', + 'Unable to fetch age verification token', fatal=False) + id_token = traverse_obj(token, ('idToken', {str})) + decoded_token = traverse_obj(id_token, ({jwt_decode_hs256}, {dict})) + user_id = traverse_obj(decoded_token, (('user_id', 'sub'), {str}), get_all=False) + if not user_id: + self.report_warning('Unable to extract token, continuing without authentication') + else: + headers['x-authorization'] = f'Bearer {id_token}' + query['userId'] = user_id + if decoded_token.get('age_rating') != 18: + self.report_warning('Account is not verified as 18+; video may be unavailable') page_data = self._download_json( - f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', display_id, query={ - 'embedded': 'false', - 'mcV6': 'true', - }) + f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', + display_id, query=query, headers=headers) # For user convenience we use the old contentId instead of the longer crid # Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283 @@ -383,7 +400,7 @@ def _real_extract(self, url): media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict})) if player_data.get('blockedByFsk'): - self.raise_no_formats('This video is only available after 22:00', expected=True) + self.raise_login_required('This video is only available for age verified users or after 22:00') formats = [] subtitles = {} From f0e8bc7c60b61fe18b63116c975609d76b904771 Mon Sep 17 00:00:00 2001 From: John Victor <37747572+johnvictorfs@users.noreply.github.com> Date: Sun, 21 Jan 2024 19:36:59 -0300 Subject: [PATCH 38/61] [ie/patreon] Fix embedded HLS extraction (#8993) Closes #8973 Authored by: johnvictorfs --- yt_dlp/extractor/patreon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 9316789df2..d2ddb72cd4 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -275,7 +275,7 @@ def _real_extract(self, url): 'ext': ext, 'url': post_file['url'], } - elif name == 'video': + elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) return { **info, From 9cd90447907a59c8a2727583f4a755fb23ed8cd3 Mon Sep 17 00:00:00 2001 From: chtk Date: Mon, 22 Jan 2024 06:57:52 +0100 Subject: [PATCH 39/61] [ie/Floatplane] Improve metadata extraction (#8934) Authored by: chtk --- yt_dlp/extractor/floatplane.py | 103 +++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/floatplane.py b/yt_dlp/extractor/floatplane.py index 2cf4d4e648..8676d73f60 100644 --- a/yt_dlp/extractor/floatplane.py +++ b/yt_dlp/extractor/floatplane.py @@ -11,6 +11,7 @@ join_nonempty, parse_codecs, parse_iso8601, + url_or_none, urljoin, ) from ..utils.traversal import traverse_obj @@ -108,6 +109,64 @@ class FloatplaneIE(InfoExtractor): 'availability': 'subscriber_only', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.floatplane.com/post/65B5PNoBtf', + 'info_dict': { + 'id': '65B5PNoBtf', + 'description': 'I recorded the inbuilt demo mode for your 90\'s enjoyment, thanks for being Floaties!', + 'display_id': '65B5PNoBtf', + 'like_count': int, + 'release_timestamp': 1701249480, + 'uploader': 'The Trash Network', + 'availability': 'subscriber_only', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + 'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'comment_count': int, + 'title': 'The $50 electronic drum kit.', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'thumbnail': 'https://pbs.floatplane.com/blogPost_thumbnails/65B5PNoBtf/725555379422705_1701247052743.jpeg', + 'dislike_count': int, + 'channel': 'The Drum Thing', + 'release_date': '20231129', + }, + 'playlist_count': 2, + 'playlist': [{ + 'info_dict': { + 'id': 'ISPJjexylS', + 'ext': 'mp4', + 'release_date': '20231129', + 'release_timestamp': 1701249480, + 'title': 'The $50 electronic drum kit. .mov', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'thumbnail': 'https://pbs.floatplane.com/video_thumbnails/ISPJjexylS/335202812134041_1701249383392.jpeg', + 'availability': 'subscriber_only', + 'uploader': 'The Trash Network', + 'duration': 622, + 'channel': 'The Drum Thing', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + }, + }, { + 'info_dict': { + 'id': 'qKfxu6fEpu', + 'ext': 'aac', + 'release_date': '20231129', + 'release_timestamp': 1701249480, + 'title': 'Roland TD-7 Demo.m4a', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'availability': 'subscriber_only', + 'uploader': 'The Trash Network', + 'duration': 114, + 'channel': 'The Drum Thing', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + }, + }], + 'skip': 'requires subscription: "The Trash Network"', + 'params': {'skip_download': 'm3u8'}, }] def _real_initialize(self): @@ -124,6 +183,22 @@ def _real_extract(self, url): if not any(traverse_obj(post_data, ('metadata', ('hasVideo', 'hasAudio')))): raise ExtractorError('Post does not contain a video or audio track', expected=True) + uploader_url = format_field( + post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None + + common_info = { + 'uploader_url': uploader_url, + 'channel_url': urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))), + 'availability': self._availability(needs_subscription=True), + **traverse_obj(post_data, { + 'uploader': ('creator', 'title', {str}), + 'uploader_id': ('creator', 'id', {str}), + 'channel': ('channel', 'title', {str}), + 'channel_id': ('channel', 'id', {str}), + 'release_timestamp': ('releaseDate', {parse_iso8601}), + }), + } + items = [] for media in traverse_obj(post_data, (('videoAttachments', 'audioAttachments'), ...)): media_id = media['id'] @@ -150,11 +225,11 @@ def format_path(params): formats = [] for quality in traverse_obj(stream, ('resource', 'data', 'qualityLevels', ...)): url = urljoin(stream['cdn'], format_path(traverse_obj( - stream, ('resource', 'data', 'qualityLevelParams', quality['name'])))) + stream, ('resource', 'data', 'qualityLevelParams', quality['name'], {dict})))) formats.append({ **traverse_obj(quality, { - 'format_id': 'name', - 'format_note': 'label', + 'format_id': ('name', {str}), + 'format_note': ('label', {str}), 'width': ('width', {int}), 'height': ('height', {int}), }), @@ -164,38 +239,28 @@ def format_path(params): }) items.append({ + **common_info, 'id': media_id, **traverse_obj(metadata, { - 'title': 'title', + 'title': ('title', {str}), 'duration': ('duration', {int_or_none}), - 'thumbnail': ('thumbnail', 'path'), + 'thumbnail': ('thumbnail', 'path', {url_or_none}), }), 'formats': formats, }) - uploader_url = format_field( - post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None - channel_url = urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))) - post_info = { + **common_info, 'id': post_id, 'display_id': post_id, **traverse_obj(post_data, { - 'title': 'title', + 'title': ('title', {str}), 'description': ('text', {clean_html}), - 'uploader': ('creator', 'title'), - 'uploader_id': ('creator', 'id'), - 'channel': ('channel', 'title'), - 'channel_id': ('channel', 'id'), 'like_count': ('likes', {int_or_none}), 'dislike_count': ('dislikes', {int_or_none}), 'comment_count': ('comments', {int_or_none}), - 'release_timestamp': ('releaseDate', {parse_iso8601}), - 'thumbnail': ('thumbnail', 'path'), + 'thumbnail': ('thumbnail', 'path', {url_or_none}), }), - 'uploader_url': uploader_url, - 'channel_url': channel_url, - 'availability': self._availability(needs_subscription=True), } if len(items) > 1: From a40b0070c2a00d3ed839897462171a82323aa875 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:28:11 +0800 Subject: [PATCH 40/61] [ie/facebook:ads] Add extractor (#8870) Closes #8083 Authored by: kclauhk --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/facebook.py | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a273ae0d9c..f51045668b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -588,6 +588,7 @@ FacebookPluginsVideoIE, FacebookRedirectURLIE, FacebookReelIE, + FacebookAdsIE, ) from .fancode import ( FancodeVodIE, diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a16a067abb..26cfda5384 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -20,6 +20,7 @@ get_element_by_id, get_first, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_count, @@ -907,3 +908,114 @@ def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) + + +class FacebookAdsIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P\d+)' + IE_NAME = 'facebook:ads' + + _TESTS = [{ + 'url': 'https://www.facebook.com/ads/library/?id=899206155126718', + 'info_dict': { + 'id': '899206155126718', + 'ext': 'mp4', + 'title': 'video by Kandao', + 'uploader': 'Kandao', + 'uploader_id': '774114102743284', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1702548330, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20231214', + 'like_count': int, + } + }, { + 'url': 'https://www.facebook.com/ads/library/?id=893637265423481', + 'info_dict': { + 'id': '893637265423481', + 'title': 'Jusqu\u2019\u00e0 -25% sur une s\u00e9lection de vins p\u00e9tillants italiens ', + 'uploader': 'Eataly Paris Marais', + 'uploader_id': '2086668958314152', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1703571529, + 'upload_date': '20231226', + 'like_count': int, + }, + 'playlist_count': 3, + }, { + 'url': 'https://es-la.facebook.com/ads/library/?id=901230958115569', + 'only_matching': True, + }, { + 'url': 'https://m.facebook.com/ads/library/?id=901230958115569', + 'only_matching': True, + }] + + _FORMATS_MAP = { + 'watermarked_video_sd_url': ('sd-wmk', 'SD, watermarked'), + 'video_sd_url': ('sd', None), + 'watermarked_video_hd_url': ('hd-wmk', 'HD, watermarked'), + 'video_hd_url': ('hd', None), + } + + def _extract_formats(self, video_dict): + formats = [] + for format_key, format_url in traverse_obj(video_dict, ( + {dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1]) + )): + formats.append({ + 'format_id': self._FORMATS_MAP[format_key][0], + 'format_note': self._FORMATS_MAP[format_key][1], + 'url': format_url, + 'ext': 'mp4', + 'quality': qualities(tuple(self._FORMATS_MAP))(format_key), + }) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + post_data = [self._parse_json(j, video_id, fatal=False) + for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)] + data = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False) + if not data: + raise ExtractorError('Unable to extract ad data') + + title = data.get('title') + if not title or title == '{{product.name}}': + title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) + + info_dict = traverse_obj(data, { + 'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}), + 'uploader': ('page_name', {str}), + 'uploader_id': ('page_id', {str_or_none}), + 'uploader_url': ('page_profile_uri', {url_or_none}), + 'timestamp': ('creation_time', {int_or_none}), + 'like_count': ('page_like_count', {int_or_none}), + }) + + entries = [] + for idx, entry in enumerate(traverse_obj( + data, (('videos', 'cards'), lambda _, v: any([url_or_none(v[f]) for f in self._FORMATS_MAP]))), 1 + ): + entries.append({ + 'id': f'{video_id}_{idx}', + 'title': entry.get('title') or title, + 'description': entry.get('link_description') or info_dict.get('description'), + 'thumbnail': url_or_none(entry.get('video_preview_image_url')), + 'formats': self._extract_formats(entry), + }) + + if len(entries) == 1: + info_dict.update(entries[0]) + + elif len(entries) > 1: + info_dict.update({ + 'title': entries[0]['title'], + 'entries': entries, + '_type': 'playlist', + }) + + info_dict['id'] = video_id + + return info_dict From 5f25f348f9eb5db842b1ec6799f95bebb7ba35a7 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Tue, 23 Jan 2024 23:20:13 +0100 Subject: [PATCH 41/61] [ie/pr0gramm] Enable POL filter and provide tags without login (#9051) Authored by: Grub4K --- yt_dlp/extractor/pr0gramm.py | 41 ++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 2a67942081..36e415f4a5 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -18,7 +18,6 @@ class Pr0grammIE(InfoExtractor): _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P[\d]+)(?:[/?#:]|$)' _TESTS = [{ - # Tags require account 'url': 'https://pr0gramm.com/new/video/5466437', 'info_dict': { 'id': '5466437', @@ -36,7 +35,6 @@ class Pr0grammIE(InfoExtractor): '_old_archive_ids': ['pr0grammstatic 5466437'], }, }, { - # Tags require account 'url': 'https://pr0gramm.com/new/3052805:comment28391322', 'info_dict': { 'id': '3052805', @@ -71,6 +69,23 @@ class Pr0grammIE(InfoExtractor): 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', '_old_archive_ids': ['pr0grammstatic 5848332'], }, + }, { + 'url': 'https://pr0gramm.com/top/5895149', + 'info_dict': { + 'id': '5895149', + 'ext': 'mp4', + 'title': 'pr0gramm-5895149 by algoholigSeeManThrower', + 'tags': 'count:19', + 'uploader': 'algoholigSeeManThrower', + 'uploader_id': 457556, + 'upload_timestamp': 1697580902, + 'upload_date': '20231018', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': 'https://thumb.pr0gramm.com/2023/10/18/db47bb3db5e1a1b3.jpg', + '_old_archive_ids': ['pr0grammstatic 5895149'], + }, }, { 'url': 'https://pr0gramm.com/static/5466437', 'only_matching': True, @@ -92,15 +107,15 @@ def _is_logged_in(self): def _maximum_flags(self): # We need to guess the flags for the content otherwise the api will raise an error # We can guess the maximum allowed flags for the account from the cookies - # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw - flags = 0b0001 + # Bitflags are (msbf): pol, nsfp, nsfl, nsfw, sfw + flags = 0b10001 if self._is_logged_in: - flags |= 0b1000 + flags |= 0b01000 cookies = self._get_cookies(self.BASE_URL) if 'me' not in cookies: self._download_webpage(self.BASE_URL, None, 'Refreshing verification information') if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')): - flags |= 0b0110 + flags |= 0b00110 return flags @@ -134,14 +149,12 @@ def _real_extract(self, url): if not source or not source.endswith('mp4'): self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id) - tags = None - if self._is_logged_in: - metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags') - tags = traverse_obj(metadata, ('tags', ..., 'tag', {str})) - # Sorted by "confidence", higher confidence = earlier in list - confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) - if confidences: - tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] + metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags') + tags = traverse_obj(metadata, ('tags', ..., 'tag', {str})) + # Sorted by "confidence", higher confidence = earlier in list + confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) + if confidences: + tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] formats = traverse_obj(video_info, ('variants', ..., { 'format_id': ('name', {str}), From 5dda3b291f59f388f953337e9fb09a94b64aaf34 Mon Sep 17 00:00:00 2001 From: Caesim404 Date: Sun, 28 Jan 2024 04:02:09 +0200 Subject: [PATCH 42/61] [ie/lsm,cloudycdn] Add extractors (#8643) Closes #2978 Authored by: Caesim404 --- yt_dlp/extractor/_extractors.py | 6 + yt_dlp/extractor/cloudycdn.py | 79 +++++++++ yt_dlp/extractor/lsm.py | 282 ++++++++++++++++++++++++++++++++ 3 files changed, 367 insertions(+) create mode 100644 yt_dlp/extractor/cloudycdn.py create mode 100644 yt_dlp/extractor/lsm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f51045668b..09565055cf 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -369,6 +369,7 @@ from .cliprs import ClipRsIE from .closertotruth import CloserToTruthIE from .cloudflarestream import CloudflareStreamIE +from .cloudycdn import CloudyCDNIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE @@ -1001,6 +1002,11 @@ LRTVODIE, LRTStreamIE ) +from .lsm import ( + LSMLREmbedIE, + LSMLTVEmbedIE, + LSMReplayIE +) from .lumni import ( LumniIE ) diff --git a/yt_dlp/extractor/cloudycdn.py b/yt_dlp/extractor/cloudycdn.py new file mode 100644 index 0000000000..e6e470e073 --- /dev/null +++ b/yt_dlp/extractor/cloudycdn.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class CloudyCDNIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//embed\.cloudycdn\.services/(?P[^/?#]+)/media/(?P[\w-]+)' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://embed.cloudycdn.services/ltv/media/46k_d23-6000-105?', + 'md5': '64f72a360ca530d5ed89c77646c9eee5', + 'info_dict': { + 'id': '46k_d23-6000-105', + 'ext': 'mp4', + 'timestamp': 1700589151, + 'duration': 1442, + 'upload_date': '20231121', + 'title': 'D23-6000-105_cetstud', + 'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg', + } + }, { + 'url': 'https://embed.cloudycdn.services/izm/media/26e_lv-8-5-1', + 'md5': '798828a479151e2444d8dcfbec76e482', + 'info_dict': { + 'id': '26e_lv-8-5-1', + 'ext': 'mp4', + 'title': 'LV-8-5-1', + 'timestamp': 1669767167, + 'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/488306/placeholder1679423604.jpg', + 'duration': 1205, + 'upload_date': '20221130', + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.tavaklase.lv/video/es-esmu-mina-um-2/', + 'md5': '63074e8e6c84ac2a01f2fb8bf03b8f43', + 'info_dict': { + 'id': 'cqd_lib-2', + 'ext': 'mp4', + 'upload_date': '20230223', + 'duration': 629, + 'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/518407/placeholder1678748124.jpg', + 'timestamp': 1677181513, + 'title': 'LIB-2', + } + }] + + def _real_extract(self, url): + site_id, video_id = self._match_valid_url(url).group('site_id', 'id') + + data = self._download_json( + f'https://player.cloudycdn.services/player/{site_id}/media/{video_id}/', + video_id, data=urlencode_postdata({ + 'version': '6.4.0', + 'referer': url, + })) + + formats, subtitles = [], {} + for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('name', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'thumbnail': ('source', 'poster', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/lsm.py b/yt_dlp/extractor/lsm.py new file mode 100644 index 0000000000..35a831fa21 --- /dev/null +++ b/yt_dlp/extractor/lsm.py @@ -0,0 +1,282 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + js_to_json, + parse_iso8601, + parse_qs, + str_or_none, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class LSMLREmbedIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?: + (?:latvijasradio|lr1|lr2|klasika|lr4|naba|radioteatris)\.lsm| + pieci + )\.lv/[^/?#]+/(?: + pleijeris|embed + )/?\?(?:[^#]+&)?(?:show|id)=(?P\d+)''' + _TESTS = [{ + 'url': 'https://latvijasradio.lsm.lv/lv/embed/?theme=black&size=16x9&showCaptions=0&id=183522', + 'md5': '719b33875cd1429846eeeaeec6df2830', + 'info_dict': { + 'id': 'a342781', + 'ext': 'mp3', + 'duration': 1823, + 'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām', + 'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/gallery_fd4675ac.jpg', + } + }, { + 'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1270&theme=white&size=16x9', + 'info_dict': { + 'id': '1270', + }, + 'playlist_count': 3, + 'playlist': [{ + 'md5': '2e61b6eceff00d14d57fdbbe6ab24cac', + 'info_dict': { + 'id': 'a297397', + 'ext': 'mp3', + 'title': 'Eriks Emanuels Šmits "Pilāta evaņģēlijs". 1. daļa', + 'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f131ae81e3c.jpg', + 'duration': 3300, + }, + }], + }, { + 'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1269&theme=white&size=16x9', + 'md5': '24810d4a961da2295d9860afdcaf4f5a', + 'info_dict': { + 'id': 'a230690', + 'ext': 'mp3', + 'title': 'Jens Ahlboms "Spārni". Radioizrāde ar Mārtiņa Freimaņa mūziku', + 'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f13023a457c.jpg', + 'duration': 1788, + } + }, { + 'url': 'https://lr1.lsm.lv/lv/embed/?id=166557&show=0&theme=white&size=16x9', + 'info_dict': { + 'id': '166557', + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': '6a8b0927572f443f09c6e50a3ad65f2d', + 'info_dict': { + 'id': 'a303104', + 'ext': 'mp3', + 'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg', + 'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits', + 'duration': 3222, + }, + }, { + 'md5': '5d5e191e718b7644e5118b7b4e093a6d', + 'info_dict': { + 'id': 'v303104', + 'ext': 'mp4', + 'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg', + 'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits - Video Version', + 'duration': 3222, + }, + }], + }, { + 'url': 'https://lr1.lsm.lv/lv/embed/?id=183522&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://lr2.lsm.lv/lv/embed/?id=182126&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://klasika.lsm.lv/lv/embed/?id=110806&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://lr4.lsm.lv/lv/embed/?id=184282&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://pieci.lv/lv/embed/?id=168896&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://naba.lsm.lv/lv/embed/?id=182901&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://radioteatris.lsm.lv/lv/embed/?id=176439&show=0&theme=white&size=16x9', + 'only_matching': True, + }, { + 'url': 'https://lr1.lsm.lv/lv/pleijeris/?embed=0&id=48205&time=00%3A00&idx=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + query = parse_qs(url) + video_id = traverse_obj(query, ( + ('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False) + webpage = self._download_webpage(url, video_id) + + player_data, media_data = self._search_regex( + r'LR\.audio\.Player\s*\([^{]*(?P\{.*?\}),(?P\{.*\})\);', + webpage, 'player json', group=('player', 'media')) + + player_json = self._parse_json( + player_data, video_id, transform_source=js_to_json, fatal=False) or {} + media_json = self._parse_json(media_data, video_id, transform_source=js_to_json) + + entries = [] + for item in traverse_obj(media_json, (('audio', 'video'), lambda _, v: v['id'])): + formats = [] + for source_url in traverse_obj(item, ('sources', ..., 'file', {url_or_none})): + if determine_ext(source_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(source_url, video_id, fatal=False)) + else: + formats.append({'url': source_url}) + + id_ = item['id'] + title = item.get('title') + if id_.startswith('v') and not title: + title = traverse_obj( + media_json, ('audio', lambda _, v: v['id'][1:] == id_[1:], 'title', + {lambda x: x and f'{x} - Video Version'}), get_all=False) + + entries.append({ + 'formats': formats, + 'thumbnail': urljoin(url, player_json.get('poster')), + 'id': id_, + 'title': title, + 'duration': traverse_obj(item, ('duration', {int_or_none})), + }) + + if len(entries) == 1: + return entries[0] + + return self.playlist_result(entries, video_id) + + +class LSMLTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://ltv\.lsm\.lv/embed\?(?:[^#]+&)?c=(?P[^#&]+)' + _TESTS = [{ + 'url': 'https://ltv.lsm.lv/embed?c=eyJpdiI6IjQzbHVUeHAyaDJiamFjcjdSUUFKdnc9PSIsInZhbHVlIjoiMHl3SnJNRmd2TmFIdnZwOGtGUUpzODFzUEZ4SVVsN2xoRjliSW9vckUyMWZIWG8vbWVzaFFkY0lhNmRjbjRpaCIsIm1hYyI6ImMzNjdhMzFhNTFhZmY1ZmE0NWI5YmFjZGI1YmJiNGEyNjgzNDM4MjUzMWEwM2FmMDMyZDMwYWM1MDFjZmM5MGIiLCJ0YWciOiIifQ==', + 'md5': '64f72a360ca530d5ed89c77646c9eee5', + 'info_dict': { + 'id': '46k_d23-6000-105', + 'ext': 'mp4', + 'timestamp': 1700589151, + 'duration': 1442, + 'upload_date': '20231121', + 'title': 'D23-6000-105_cetstud', + 'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg', + } + }, { + 'url': 'https://ltv.lsm.lv/embed?enablesdkjs=1&c=eyJpdiI6IncwVzZmUFk2MU12enVWK1I3SUcwQ1E9PSIsInZhbHVlIjoid3FhV29vamc3T2sxL1RaRmJ5Rm1GTXozU0o2dVczdUtLK0cwZEZJMDQ2a3ZIRG5DK2pneGlnbktBQy9uazVleHN6VXhxdWIweWNvcHRDSnlISlNYOHlVZ1lpcTUrcWZSTUZPQW14TVdkMW9aOUtRWVNDcFF4eWpHNGcrT0VZbUNFQStKQk91cGpndW9FVjJIa0lpbkh3PT0iLCJtYWMiOiIyZGI1NDJlMWRlM2QyMGNhOGEwYTM2MmNlN2JlOGRhY2QyYjdkMmEzN2RlOTEzYTVkNzI1ODlhZDlhZjU4MjQ2IiwidGFnIjoiIn0=', + 'md5': 'a1711e190fe680fdb68fd8413b378e87', + 'info_dict': { + 'id': 'wUnFArIPDSY', + 'ext': 'mp4', + 'uploader': 'LTV_16plus', + 'release_date': '20220514', + 'channel_url': 'https://www.youtube.com/channel/UCNMrnafwXD2XKeeQOyfkFCw', + 'view_count': int, + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi/wUnFArIPDSY/maxresdefault.jpg', + 'release_timestamp': 1652544074, + 'title': 'EIROVĪZIJA SALĀTOS', + 'live_status': 'was_live', + 'uploader_id': '@LTV16plus', + 'comment_count': int, + 'channel_id': 'UCNMrnafwXD2XKeeQOyfkFCw', + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'duration': 5269, + 'upload_date': '20220514', + 'age_limit': 0, + 'channel': 'LTV_16plus', + 'playable_in_embed': True, + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@LTV16plus', + 'like_count': int, + 'description': 'md5:7ff0c42ba971e3c13e4b8a2ff03b70b5', + } + }] + + def _real_extract(self, url): + video_id = urllib.parse.unquote(self._match_id(url)) + webpage = self._download_webpage(url, video_id) + data = self._search_json( + r'window\.ltvEmbedPayload\s*=', webpage, 'embed json', video_id) + embed_type = traverse_obj(data, ('source', 'name', {str})) + + if embed_type == 'telia': + ie_key = 'CloudyCDN' + embed_url = traverse_obj(data, ('source', 'embed_url', {url_or_none})) + elif embed_type == 'youtube': + ie_key = 'Youtube' + embed_url = traverse_obj(data, ('source', 'id', {str})) + else: + raise ExtractorError(f'Unsupported embed type {embed_type!r}') + + return self.url_result( + embed_url, ie_key, video_id, **traverse_obj(data, { + 'title': ('parentInfo', 'title'), + 'duration': ('parentInfo', 'duration', {int_or_none}), + 'thumbnail': ('source', 'poster', {url_or_none}), + })) + + +class LSMReplayIE(InfoExtractor): + _VALID_URL = r'https?://replay\.lsm\.lv/[^/?#]+/(?:ieraksts|statja)/[^/?#]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://replay.lsm.lv/lv/ieraksts/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija', + 'md5': '64f72a360ca530d5ed89c77646c9eee5', + 'info_dict': { + 'id': '46k_d23-6000-105', + 'ext': 'mp4', + 'timestamp': 1700586300, + 'description': 'md5:0f1b14798cc39e1ae578bd0eb268f759', + 'duration': 1442, + 'upload_date': '20231121', + 'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija', + 'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg', + } + }, { + 'url': 'https://replay.lsm.lv/lv/ieraksts/lr/183522/138-nepilniga-kompensejamo-zalu-sistema-pat-menesiem-dzena-pacientus-pa-aptiekam', + 'md5': '719b33875cd1429846eeeaeec6df2830', + 'info_dict': { + 'id': 'a342781', + 'ext': 'mp3', + 'duration': 1823, + 'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām', + 'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/large_fd4675ac.jpg', + 'upload_date': '20231102', + 'timestamp': 1698921060, + 'description': 'md5:7bac3b2dd41e44325032943251c357b1', + } + }, { + 'url': 'https://replay.lsm.lv/ru/statja/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija', + 'only_matching': True, + }] + + def _fix_nuxt_data(self, webpage): + return re.sub(r'Object\.create\(null(?:,(\{.+\}))?\)', lambda m: m.group(1) or 'null', webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data = self._search_nuxt_data( + self._fix_nuxt_data(webpage), video_id, context_name='__REPLAY__') + + return { + '_type': 'url_transparent', + 'id': video_id, + **traverse_obj(data, { + 'url': ('playback', 'service', 'url', {url_or_none}), + 'title': ('mediaItem', 'title'), + 'description': ('mediaItem', ('lead', 'body')), + 'duration': ('mediaItem', 'duration', {int_or_none}), + 'timestamp': ('mediaItem', 'aired_at', {parse_iso8601}), + 'thumbnail': ('mediaItem', 'largeThumbnail', {url_or_none}), + }, get_all=False), + } From d79c7e9937c388c68b722ab7450960e43ef776d6 Mon Sep 17 00:00:00 2001 From: shmohawk Date: Sun, 28 Jan 2024 03:10:20 +0100 Subject: [PATCH 43/61] [ie/Txxx] Extract thumbnails (#9063) Authored by: shmohawk --- yt_dlp/extractor/txxx.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/yt_dlp/extractor/txxx.py b/yt_dlp/extractor/txxx.py index fff7a5d76c..77dabbc828 100644 --- a/yt_dlp/extractor/txxx.py +++ b/yt_dlp/extractor/txxx.py @@ -10,6 +10,7 @@ parse_duration, traverse_obj, try_call, + url_or_none, urljoin, variadic, ) @@ -83,6 +84,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg', } }, { 'url': 'https://txxx.tube/videos/16574965/digital-desire-malena-morgan/', @@ -98,6 +100,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg', } }, { 'url': 'https://vxxx.com/video-68925/', @@ -113,6 +116,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.vxxx.com/contents/videos_sources/68000/68925/screenshots/1.jpg', } }, { 'url': 'https://hclips.com/videos/6291073/malena-morgan-masturbates-her-sweet/', @@ -128,6 +132,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/6291000/6291073/screenshots/1.jpg', } }, { 'url': 'https://hdzog.com/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/', @@ -143,6 +148,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg', } }, { 'url': 'https://hdzog.tube/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/', @@ -158,6 +164,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg', } }, { 'url': 'https://hotmovs.com/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/', @@ -173,6 +180,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg', } }, { 'url': 'https://hotmovs.tube/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/', @@ -188,6 +196,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg', } }, { 'url': 'https://inporn.com/video/517897/malena-morgan-solo/', @@ -203,6 +212,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://iptn.m3pd.com/media/tn/sources/517897_1.jpg', } }, { 'url': 'https://privatehomeclips.com/videos/3630599/malena-morgan-cam-show/', @@ -218,6 +228,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/3630000/3630599/screenshots/15.jpg', } }, { 'url': 'https://tubepornclassic.com/videos/1015455/mimi-rogers-full-body-massage-nude-compilation/', @@ -233,6 +244,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.tubepornclassic.com/contents/videos_sources/1015000/1015455/screenshots/6.jpg', } }, { 'url': 'https://upornia.com/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/', @@ -248,6 +260,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg', } }, { 'url': 'https://upornia.tube/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/', @@ -263,6 +276,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg', } }, { 'url': 'https://vjav.com/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/', @@ -278,6 +292,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg', } }, { 'url': 'https://vjav.tube/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/', @@ -293,6 +308,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg', } }, { 'url': 'https://voyeurhit.com/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/', @@ -308,6 +324,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg', } }, { 'url': 'https://voyeurhit.tube/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/', @@ -323,6 +340,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg', } }] _WEBPAGE_TESTS = [{ @@ -338,6 +356,7 @@ class TxxxIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'age_limit': 18, + 'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/5119000/5119660/screenshots/1.jpg', } }] @@ -371,6 +390,7 @@ def _real_extract(self, url): 'like_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'likes'))), 'dislike_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'dislikes'))), 'age_limit': 18, + 'thumbnail': traverse_obj(video_info, ('video', 'thumbsrc', {url_or_none})), 'formats': get_formats(host, video_file), } From 77c2472ca1ef9050a66aa68bc5fa1bee88706c66 Mon Sep 17 00:00:00 2001 From: jazz1611 Date: Sun, 28 Jan 2024 09:12:40 +0700 Subject: [PATCH 44/61] [ie/Gofile] Fix extraction (#9074) Closes #9073 Authored by: jazz1611 --- yt_dlp/extractor/gofile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py index ef14b57d08..eb1dcf85f5 100644 --- a/yt_dlp/extractor/gofile.py +++ b/yt_dlp/extractor/gofile.py @@ -66,7 +66,7 @@ def _entries(self, file_id): query_params = { 'contentId': file_id, 'token': self._TOKEN, - 'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js + 'wt': '4fd6sg89d7s6', # From https://gofile.io/dist/js/alljs.js } password = self.get_param('videopassword') if password: From c91d8b1899403daff6fc15206ad32de8db17fb8f Mon Sep 17 00:00:00 2001 From: jazz1611 Date: Sun, 28 Jan 2024 09:15:29 +0700 Subject: [PATCH 45/61] [ie/redtube] Fix formats extraction (#9076) Authored by: jazz1611 --- yt_dlp/extractor/redtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py index 172c31b396..36d530dafc 100644 --- a/yt_dlp/extractor/redtube.py +++ b/yt_dlp/extractor/redtube.py @@ -7,6 +7,7 @@ str_to_int, unified_strdate, url_or_none, + urljoin, ) @@ -79,7 +80,7 @@ def _real_extract(self, url): 'media definitions', default='{}'), video_id, fatal=False) for media in medias if isinstance(medias, list) else []: - format_url = url_or_none(media.get('videoUrl')) + format_url = urljoin('https://www.redtube.com', media.get('videoUrl')) if not format_url: continue format_id = media.get('format') From cae6e461073fb7c32fd32052a3e6721447c469bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=BCndig?= Date: Sun, 28 Jan 2024 03:19:54 +0100 Subject: [PATCH 46/61] [ie/PlaySuisse] Add login support (#9077) Closes #7974 Authored by: chkuendig --- yt_dlp/extractor/playsuisse.py | 53 ++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 76288c7789..7c5cad1be6 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -1,10 +1,18 @@ import json from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + traverse_obj, + update_url_query, + urlencode_postdata, +) class PlaySuisseIE(InfoExtractor): + _NETRC_MACHINE = 'playsuisse' _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P[0-9]+)' _TESTS = [ { @@ -134,12 +142,47 @@ class PlaySuisseIE(InfoExtractor): id url }''' + _LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com' + _LOGIN_PATH = 'B2C_1A__SignInV2' + _ID_TOKEN = None + + def _perform_login(self, username, password): + login_page = self._download_webpage( + 'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page', + query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'}) + settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None) + + csrf_token = settings['csrf'] + query = {'tx': settings['transId'], 'p': self._LOGIN_PATH} + + status = traverse_obj(self._download_json( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in', + query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({ + 'request_type': 'RESPONSE', + 'signInName': username, + 'password': password + }), expected_status=400), ('status', {int_or_none})) + if status == 400: + raise ExtractorError('Invalid username or password', expected=True) + + urlh = self._request_webpage( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed', + None, 'Downloading ID token', query={ + 'rememberMe': 'false', + 'csrf_token': csrf_token, + **query, + 'diags': '', + }) + + self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0)) + if not self._ID_TOKEN: + raise ExtractorError('Login failed') def _get_media_data(self, media_id): # NOTE In the web app, the "locale" header is used to switch between languages, # However this doesn't seem to take effect when passing the header here. response = self._download_json( - 'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql', + 'https://www.playsuisse.ch/api/graphql', media_id, data=json.dumps({ 'operationName': 'AssetWatch', 'query': self._GRAPHQL_QUERY, @@ -150,6 +193,9 @@ def _get_media_data(self, media_id): return response['data']['assetV2'] def _real_extract(self, url): + if not self._ID_TOKEN: + self.raise_login_required(method='password') + media_id = self._match_id(url) media_data = self._get_media_data(media_id) info = self._extract_single(media_data) @@ -168,7 +214,8 @@ def _extract_single(self, media_data): if not media.get('url') or media.get('type') != 'HLS': continue f, subs = self._extract_m3u8_formats_and_subtitles( - media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) + update_url_query(media['url'], {'id_token': self._ID_TOKEN}), + media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) formats.extend(f) self._merge_subtitles(subs, target=subtitles) From 0023af81fbce01984f35b34ecaf8562739831227 Mon Sep 17 00:00:00 2001 From: vista-narvas Date: Sun, 28 Jan 2024 16:32:19 +0100 Subject: [PATCH 47/61] [ie/RumbleChannel] Fix extractor (#9092) Closes #8782 Authored by: vista-narvas, Pranaxcau --- yt_dlp/extractor/rumble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 85567d9a22..1dc049ac8f 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -383,7 +383,7 @@ def entries(self, url, playlist_id): if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise - for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage): + for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): yield self.url_result('https://rumble.com' + video_url) def _real_extract(self, url): From 9526b1f179d19f75284eceaa5e0ee381af18cf19 Mon Sep 17 00:00:00 2001 From: Christopher Schreiner Date: Sun, 28 Jan 2024 17:03:19 +0100 Subject: [PATCH 48/61] [ie/adn] Improve auth error handling (#9068) Closes #9067 Authored by: infanf --- yt_dlp/extractor/adn.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index ed23226a35..898d372980 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -3,6 +3,7 @@ import json import os import random +import time from .common import InfoExtractor from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 @@ -17,6 +18,7 @@ int_or_none, intlist_to_bytes, long_to_bytes, + parse_iso8601, pkcs1pad, strip_or_none, str_or_none, @@ -185,7 +187,10 @@ def _real_extract(self, url): user = options['user'] if not user.get('hasAccess'): - self.raise_login_required() + start_date = traverse_obj(options, ('video', 'startDate', {str})) + if (parse_iso8601(start_date) or 0) > time.time(): + raise ExtractorError(f'This video is not available yet. Release date: {start_date}', expected=True) + self.raise_login_required('This video requires a subscription', method='password') token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), @@ -267,6 +272,9 @@ def _real_extract(self, url): f['language'] = 'de' formats.extend(m3u8_formats) + if not formats: + self.raise_login_required('This video requires a subscription', method='password') + video = (self._download_json( self._API_BASE_URL + 'video/%s' % video_id, video_id, 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} From 5b68c478fb0b93ea6b8fac23f50e12217fa063db Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Mon, 29 Jan 2024 02:39:14 +0800 Subject: [PATCH 49/61] [ie/facebook] Set format HTTP chunk size (#9058) Closes #8197 Authored by: bashonly, kclauhk --- yt_dlp/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 26cfda5384..84856abe1b 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -564,7 +564,11 @@ def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around # with non-browser User-Agent. for f in info['formats']: + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + # Formats larger than ~500MB will return error 403 unless chunk size is regulated + f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 def extract_relay_data(_filter): return self._parse_json(self._search_regex( From 3c4d3ee491b0ec22ed3cade51d943d3d27141ba7 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Mon, 29 Jan 2024 02:41:56 +0800 Subject: [PATCH 50/61] [ie/facebook] Improve thumbnail extraction (#9060) Authored by: kclauhk --- yt_dlp/extractor/facebook.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 84856abe1b..2fbdf1c37c 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -682,6 +682,9 @@ def parse_attachment(attachment, key='media'): # honor precise duration in video info if video_info.get('duration'): webpage_info['duration'] = video_info['duration'] + # preserve preferred_thumbnail in video info + if video_info.get('thumbnail'): + webpage_info['thumbnail'] = video_info['thumbnail'] return merge_dicts(webpage_info, video_info) if not video_data: From 87286e93af949c4e6a0f8ba34af6a1ab5aa102b6 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Mon, 29 Jan 2024 02:50:03 +0800 Subject: [PATCH 51/61] [ie/facebook] Support permalink URLs (#9061) Authored by: kclauhk --- yt_dlp/extractor/facebook.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 2fbdf1c37c..d186b57bf5 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -44,6 +44,7 @@ class FacebookIE(InfoExtractor): (?:[^#]*?\#!/)? (?: (?: + permalink\.php| video/video\.php| photo\.php| video\.php| @@ -249,6 +250,7 @@ class FacebookIE(InfoExtractor): 'duration': 148.435, }, }, { + # data.node.comet_sections.content.story.attachments[].styles.attachment.media 'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl', 'info_dict': { 'id': '6968553779868435', @@ -263,6 +265,22 @@ class FacebookIE(InfoExtractor): 'thumbnail': r're:^https?://.*', 'timestamp': 1701975646, }, + }, { + # data.node.comet_sections.content.story.attachments[].styles.attachment.media + 'url': 'https://www.facebook.com/permalink.php?story_fbid=pfbid0fqQuVEQyXRa9Dp4RcaTR14KHU3uULHV1EK7eckNXSH63JMuoALsAvVCJ97zAGitil&id=100068861234290', + 'info_dict': { + 'id': '270103405756416', + 'ext': 'mp4', + 'title': 'Lela Evans', + 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', + 'thumbnail': r're:^https?://.*', + 'uploader': 'Lela Evans', + 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'upload_date': '20231228', + 'timestamp': 1703804085, + 'duration': 394.347, + 'view_count': int, + }, }, { 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', 'only_matching': True, From a514cc2feb1c3b265b19acab11487acad8bb3ab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Sun, 28 Jan 2024 20:58:34 +0200 Subject: [PATCH 52/61] [ie/ERRJupiter] Add extractor (#8549) Authored by: glensc --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/err.py | 199 ++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 yt_dlp/extractor/err.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 09565055cf..2fc1e116ba 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -565,6 +565,7 @@ EroProfileIE, EroProfileAlbumIE, ) +from .err import ERRJupiterIE from .ertgr import ( ERTFlixCodenameIE, ERTFlixIE, diff --git a/yt_dlp/extractor/err.py b/yt_dlp/extractor/err.py new file mode 100644 index 0000000000..129f39ad6a --- /dev/null +++ b/yt_dlp/extractor/err.py @@ -0,0 +1,199 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class ERRJupiterIE(InfoExtractor): + _VALID_URL = r'https?://jupiter(?:pluss)?\.err\.ee/(?P\d+)' + _TESTS = [{ + 'note': 'Jupiter: Movie: siin-me-oleme', + 'url': 'https://jupiter.err.ee/1211107/siin-me-oleme', + 'md5': '9b45d1682a98853acaa1e1b0c791f425', + 'info_dict': { + 'id': '1211107', + 'ext': 'mp4', + 'title': 'Siin me oleme!', + 'alt_title': '', + 'description': 'md5:1825b795f5f7584241aeb59e5bbb4f70', + 'release_date': '20231226', + 'upload_date': '20201217', + 'modified_date': '20201217', + 'release_timestamp': 1703577600, + 'timestamp': 1608210000, + 'modified_timestamp': 1608220800, + 'release_year': 1978, + }, + }, { + 'note': 'Jupiter: Series: Impulss', + 'url': 'https://jupiter.err.ee/1609145945/impulss', + 'md5': 'a378486df07ed1ba74e46cc861886243', + 'info_dict': { + 'id': '1609145945', + 'ext': 'mp4', + 'title': 'Impulss', + 'alt_title': 'Loteriipilet hooldekodusse', + 'description': 'md5:fa8a2ed0cdccb130211513443ee4d571', + 'release_date': '20231107', + 'upload_date': '20231026', + 'modified_date': '20231118', + 'release_timestamp': 1699380000, + 'timestamp': 1698327601, + 'modified_timestamp': 1700311802, + 'series': 'Impulss', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Loteriipilet hooldekodusse', + 'episode_number': 6, + 'series_id': '1609108187', + 'release_year': 2023, + 'episode_id': '1609145945', + }, + }, { + 'note': 'Jupiter: Radio Show: mnemoturniir episode', + 'url': 'https://jupiter.err.ee/1037919/mnemoturniir', + 'md5': 'f1eb95fe66f9620ff84e81bbac37076a', + 'info_dict': { + 'id': '1037919', + 'ext': 'm4a', + 'title': 'Mnemoturniir', + 'alt_title': '', + 'description': 'md5:626db52394e7583c26ab74d6a34d9982', + 'release_date': '20240121', + 'upload_date': '20240108', + 'modified_date': '20240121', + 'release_timestamp': 1705827900, + 'timestamp': 1704675602, + 'modified_timestamp': 1705827601, + 'series': 'Mnemoturniir', + 'season': 'Season 0', + 'season_number': 0, + 'episode': 'Episode 0', + 'episode_number': 0, + 'series_id': '1037919', + 'release_year': 2024, + 'episode_id': '1609215101', + }, + }, { + 'note': 'Jupiter+: Clip: bolee-zelenyj-tallinn', + 'url': 'https://jupiterpluss.err.ee/1609180445/bolee-zelenyj-tallinn', + 'md5': '1b812270c4daf6ce51c06bfeaf33ed95', + 'info_dict': { + 'id': '1609180445', + 'ext': 'mp4', + 'title': 'Более зеленый Таллинн', + 'alt_title': '', + 'description': 'md5:fd34d9bf939c28c4a725b19a7f0d6320', + 'release_date': '20231224', + 'upload_date': '20231130', + 'modified_date': '20231207', + 'release_timestamp': 1703423400, + 'timestamp': 1701338400, + 'modified_timestamp': 1701967200, + 'release_year': 2023, + }, + }, { + 'note': 'Jupiter+: Series: The Sniffer', + 'url': 'https://jupiterpluss.err.ee/1608311387/njuhach', + 'md5': '2abdeb7131ce551bce49e8d0cea08536', + 'info_dict': { + 'id': '1608311387', + 'ext': 'mp4', + 'title': 'Нюхач', + 'alt_title': '', + 'description': 'md5:8c5c7d8f32ec6e54cd498c9e59ca83bc', + 'release_date': '20230601', + 'upload_date': '20210818', + 'modified_date': '20210903', + 'release_timestamp': 1685633400, + 'timestamp': 1629318000, + 'modified_timestamp': 1630686000, + 'release_year': 2013, + 'episode': 'Episode 1', + 'episode_id': '1608311390', + 'episode_number': 1, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Нюхач', + 'series_id': '1608311387', + }, + }, { + 'note': 'Jupiter+: Podcast: lesnye-istorii-aisty', + 'url': 'https://jupiterpluss.err.ee/1608990335/lesnye-istorii-aisty', + 'md5': '8b46d7e4510b254a14b7a52211b5bf96', + 'info_dict': { + 'id': '1608990335', + 'ext': 'm4a', + 'title': 'Лесные истории | Аисты', + 'alt_title': '', + 'description': 'md5:065e721623e271e7a63e6540d409ca6b', + 'release_date': '20230609', + 'upload_date': '20230527', + 'modified_date': '20230608', + 'release_timestamp': 1686308700, + 'timestamp': 1685145600, + 'modified_timestamp': 1686252600, + 'release_year': 2023, + 'episode': 'Episode 0', + 'episode_id': '1608990335', + 'episode_number': 0, + 'season': 'Season 0', + 'season_number': 0, + 'series': 'Лесные истории | Аисты', + 'series_id': '1037497', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://services.err.ee/api/v2/vodContent/getContentPageData', video_id, + query={'contentId': video_id})['data']['mainContent'] + + media_data = traverse_obj(data, ('medias', ..., {dict}), get_all=False) + if traverse_obj(media_data, ('restrictions', 'drm', {bool})): + self.report_drm(video_id) + + formats, subtitles = [], {} + for format_url in set(traverse_obj(media_data, ('src', ('hls', 'hls2', 'hlsNew'), {url_or_none}))): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + for format_url in set(traverse_obj(media_data, ('src', ('dash', 'dashNew'), {url_or_none}))): + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if format_url := traverse_obj(media_data, ('src', 'file', {url_or_none})): + formats.append({ + 'url': format_url, + 'format_id': 'http', + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('heading', {str}), + 'alt_title': ('subHeading', {str}), + 'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}), + 'timestamp': ('created', {int_or_none}), + 'modified_timestamp': ('updated', {int_or_none}), + 'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}), + 'release_year': ('year', {int_or_none}), + }, get_all=False), + **(traverse_obj(data, { + 'series': ('heading', {str}), + 'series_id': ('rootContentId', {str_or_none}), + 'episode': ('subHeading', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'episode_id': ('id', {str_or_none}), + }) if data.get('type') == 'episode' else {}), + } From 02e343f6ef6d7b3f9087ff69e4a1db0b4b4a5c5d Mon Sep 17 00:00:00 2001 From: Danish Humair Date: Mon, 29 Jan 2024 02:23:52 +0500 Subject: [PATCH 53/61] [ie/MedalTV] Fix extraction (#9098) Closes #8766 Authored by: Danish-H --- yt_dlp/extractor/medaltv.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index 9e57ee21af..eeb5b85f38 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -8,7 +8,8 @@ float_or_none, int_or_none, str_or_none, - traverse_obj + traverse_obj, + update_url_query, ) @@ -16,7 +17,7 @@ class MedalTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K', - 'md5': '6930f8972914b6b9fdc2bb3918098ba0', + 'md5': '03e4911fdcf7fce563090705c2e79267', 'info_dict': { 'id': 'jTBFnLKdLy15K', 'ext': 'mp4', @@ -33,8 +34,8 @@ class MedalTVIE(InfoExtractor): 'duration': 13, } }, { - 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH', - 'md5': '3d19d426fe0b2d91c26e412684e66a06', + 'url': 'https://medal.tv/games/cod-cold-war/clips/2mA60jWAGQCBH', + 'md5': 'fc7a3e4552ae8993c1c4006db46be447', 'info_dict': { 'id': '2mA60jWAGQCBH', 'ext': 'mp4', @@ -52,7 +53,7 @@ class MedalTVIE(InfoExtractor): 'duration': 23, } }, { - 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA', + 'url': 'https://medal.tv/games/cod-cold-war/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { 'id': '2um24TWdty0NA', @@ -81,7 +82,7 @@ class MedalTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(update_url_query(url, {'mobilebypass': 'true'}), video_id) hydration_data = self._search_json( r']*>[^<]*\bhydrationData\s*=', webpage, From 41b6cdb4197aaf7ad82bdad6885eb5d5c64acd74 Mon Sep 17 00:00:00 2001 From: Nur Mahmud Ul Alam Tasin <62534505+NurTasin@users.noreply.github.com> Date: Mon, 29 Jan 2024 04:33:44 +0600 Subject: [PATCH 54/61] [ie/viewlift] Add support for chorki.com (#9095) Closes #3369 Authored by: NurTasin --- yt_dlp/extractor/viewlift.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py index 8f686f05db..c93be5f3d6 100644 --- a/yt_dlp/extractor/viewlift.py +++ b/yt_dlp/extractor/viewlift.py @@ -12,7 +12,7 @@ class ViewLiftBaseIE(InfoExtractor): _API_BASE = 'https://prod-api.viewlift.com/' - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' + _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb|chorki)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' _SITE_MAP = { 'ftfnext': 'lax', 'funnyforfree': 'snagfilms', @@ -27,6 +27,7 @@ class ViewLiftBaseIE(InfoExtractor): 'snagxtreme': 'snagfilms', 'theidentitytb': 'tampabay', 'vayafilm': 'snagfilms', + 'chorki': 'prothomalo', } _TOKENS = {} @@ -296,6 +297,33 @@ class ViewLiftIE(ViewLiftBaseIE): }, { # Premium movie 'url': 'https://www.hoichoi.tv/movies/detective-2020', 'only_matching': True + }, { # Chorki Premium series + 'url': 'https://www.chorki.com/bn/series/sinpaat', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bn/series/sinpaat', + }, + }, { # Chorki free movie + 'url': 'https://www.chorki.com/bn/videos/bangla-movie-bikkhov', + 'info_dict': { + 'id': '564e755b-f5c7-4515-aee6-8959bee18c93', + 'title': 'Bikkhov', + 'ext': 'mp4', + 'upload_date': '20230824', + 'timestamp': 1692860553, + 'categories': ['Action Movies', 'Salman Special'], + 'tags': 'count:14', + 'thumbnail': 'https://snagfilms-a.akamaihd.net/dd078ff5-b16e-45e4-9723-501b56b9df0a/images/2023/08/24/1692860450729_1920x1080_16x9Images.jpg', + 'display_id': 'bn/videos/bangla-movie-bikkhov', + 'description': 'md5:71492b086450625f4374a3eb824f27dc', + 'duration': 8002, + }, + 'params': { + 'skip_download': True, + }, + }, { # Chorki Premium movie + 'url': 'https://www.chorki.com/bn/videos/something-like-an-autobiography', + 'only_matching': True, }] @classmethod From 999ea80beb053491089d256104c4188aced3110f Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:38:25 +0100 Subject: [PATCH 55/61] [ie/art19] Add extractors (#9099) Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/art19.py | 303 ++++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+) create mode 100644 yt_dlp/extractor/art19.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2fc1e116ba..f8488d3041 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -138,6 +138,10 @@ ARDMediathekCollectionIE, ARDIE, ) +from .art19 import ( + Art19IE, + Art19ShowIE, +) from .arte import ( ArteTVIE, ArteTVEmbedIE, diff --git a/yt_dlp/extractor/art19.py b/yt_dlp/extractor/art19.py new file mode 100644 index 0000000000..271c505daf --- /dev/null +++ b/yt_dlp/extractor/art19.py @@ -0,0 +1,303 @@ +import re + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class Art19IE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}' + _VALID_URL = [ + rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P{_UUID_REGEX})', + rf'https?://rss\.art19\.com/episodes/(?P{_UUID_REGEX})\.mp3', + ] + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL[0]})'] + + _TESTS = [{ + 'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3', + 'info_dict': { + 'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'ext': 'mp3', + 'title': 'Why Did DeSantis Drop Out?', + 'series': 'The Daily Briefing', + 'release_timestamp': 1705941275, + 'description': 'md5:da38961da4a3f7e419471365e3c6b49f', + 'episode': 'Episode 582', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d', + 'upload_date': '20240122', + 'timestamp': 1705940815, + 'episode_number': 582, + 'modified_date': '20240122', + 'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'modified_timestamp': 1705941275, + 'release_date': '20240122', + 'duration': 527.4, + }, + }, { + 'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd', + 'info_dict': { + 'id': '8319b776-4153-4d22-8630-631f204a03dd', + 'ext': 'mp3', + 'title': 'Martha Stewart: The Homemaker Hustler Part 2', + 'modified_date': '20240116', + 'upload_date': '20240105', + 'modified_timestamp': 1705435802, + 'episode_id': '8319b776-4153-4d22-8630-631f204a03dd', + 'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'description': 'md5:4aa7cfd1358dc57e729835bc208d7893', + 'release_timestamp': 1705305660, + 'release_date': '20240115', + 'timestamp': 1704481536, + 'episode_number': 88, + 'series': 'Scamfluencers', + 'duration': 2588.37501, + 'episode': 'Episode 88', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html', + 'info_dict': { + 'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'ext': 'mp3', + 'title': "'Verstappen wordt een synoniem voor Formule 1'", + 'season': 'Seizoen 6', + 'description': 'md5:39a7159a31c4cda312b2e893bdd5c071', + 'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'duration': 3061.82111, + 'series_id': '93f4e113-2a60-4609-a564-755058fa40d8', + 'release_date': '20231126', + 'modified_timestamp': 1701156004, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'season_number': 6, + 'episode_number': 52, + 'modified_date': '20231128', + 'upload_date': '20231126', + 'timestamp': 1701025981, + 'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26', + 'series': 'De Boordradio', + 'release_timestamp': 1701026308, + 'episode': 'Episode 52', + }, + }, { + 'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/', + 'info_dict': { + 'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'ext': 'mp3', + 'title': 'Larry Bucshon announces retirement from congress', + 'upload_date': '20240115', + 'episode_number': 148, + 'episode': 'Episode 148', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'release_date': '20240115', + 'timestamp': 1705328205, + 'release_timestamp': 1705329275, + 'series': 'All INdiana Politics', + 'modified_date': '20240117', + 'modified_timestamp': 1705458901, + 'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1', + 'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'description': 'md5:53b5239e4d14973a87125c217c255b2a', + 'duration': 1256.18848, + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for episode_id in re.findall( + rf']+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]', webpage): + yield f'https://rss.art19.com/episodes/{episode_id}.mp3' + + def _real_extract(self, url): + episode_id = self._match_id(url) + + player_metadata = self._download_json( + f'https://art19.com/episodes/{episode_id}', episode_id, + note='Downloading player metadata', fatal=False, + headers={'Accept': 'application/vnd.art19.v0+json'}) + rss_metadata = self._download_json( + f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False, + note='Downloading RSS metadata') + + formats = [{ + 'format_id': 'direct', + 'url': f'https://rss.art19.com/episodes/{episode_id}.mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }] + for fmt_id, fmt_data in traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...)): + if fmt_id == 'waveform_bin': + continue + fmt_url = traverse_obj(fmt_data, ('url', {url_or_none})) + if not fmt_url: + continue + formats.append({ + 'format_id': fmt_id, + 'url': fmt_url, + 'vcodec': 'none', + 'acodec': fmt_id, + 'quality': -2 if fmt_id == 'ogg' else -1, + }) + + return { + 'id': episode_id, + 'formats': formats, + **traverse_obj(player_metadata, ('episode', { + 'title': ('title', {str}), + 'description': ('description_plain', {str}), + 'episode_id': ('id', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'season_id': ('season_id', {str}), + 'series_id': ('series_id', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'release_timestamp': ('released_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}) + })), + **traverse_obj(rss_metadata, ('content', { + 'title': ('episode_title', {str}), + 'description': ('episode_description_plain', {str}), + 'episode_id': ('episode_id', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', {int_or_none}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'thumbnail': ('cover_image', {url_or_none}), + 'duration': ('duration', {float_or_none}), + })), + } + + +class Art19ShowIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P[\w-]+)(?:/embed)?/?' + _VALID_URL = [ + rf'{_VALID_URL_BASE}(?:$|[#?])', + r'https?://rss\.art19\.com/(?P[\w-]+)/?(?:$|[#?])', + ] + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL_BASE}[^\'"])'] + + _TESTS = [{ + 'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/', + 'info_dict': { + '_type': 'playlist', + 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0', + 'display_id': 'echt-gebeurd', + 'title': 'Echt Gebeurd', + 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560', + 'timestamp': 1492642167, + 'upload_date': '20170419', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:7', + }, + 'playlist_mincount': 425, + }, { + 'url': 'https://www.art19.com/shows/echt-gebeurd', + 'info_dict': { + '_type': 'playlist', + 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0', + 'display_id': 'echt-gebeurd', + 'title': 'Echt Gebeurd', + 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560', + 'timestamp': 1492642167, + 'upload_date': '20170419', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:7', + }, + 'playlist_mincount': 425, + }, { + 'url': 'https://rss.art19.com/scamfluencers', + 'info_dict': { + '_type': 'playlist', + 'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'display_id': 'scamfluencers', + 'title': 'Scamfluencers', + 'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7', + 'timestamp': 1647368573, + 'upload_date': '20220315', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': [], + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://art19.com/shows/enthuellt/embed', + 'info_dict': { + '_type': 'playlist', + 'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c', + 'display_id': 'enthuellt', + 'title': 'Enthüllt', + 'description': 'md5:17752246643414a2fd51744fc9a1c08e', + 'timestamp': 1601645860, + 'upload_date': '20201002', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:10', + }, + 'playlist_mincount': 10, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast', + 'info_dict': { + '_type': 'playlist', + 'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21', + 'display_id': 'deconstructing-yourself', + 'title': 'Deconstructing Yourself', + 'description': 'md5:dab5082b28b248a35476abf64768854d', + 'timestamp': 1570581181, + 'upload_date': '20191009', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:5', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/', + 'info_dict': { + '_type': 'playlist', + 'id': '9dfa2c37-ab87-4c13-8388-4897914313ec', + 'display_id': 'the-ben-joravsky-show', + 'title': 'The Ben Joravsky Show', + 'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a', + 'timestamp': 1550875095, + 'upload_date': '20190222', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'], + }, + 'playlist_mincount': 1900, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for series_id in re.findall( + r']+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage): + yield f'https://art19.com/shows/{series_id}' + + def _real_extract(self, url): + series_id = self._match_id(url) + series_metadata = self._download_json( + f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata', + headers={'Accept': 'application/vnd.art19.v0+json'}) + + return { + '_type': 'playlist', + 'entries': [ + self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE) + for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str})) + ], + **traverse_obj(series_metadata, ('series', { + 'id': ('id', {str}), + 'display_id': ('slug', {str}), + 'title': ('title', {str}), + 'description': ('description_plain', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + })), + 'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})), + } From 9b5efaf86b99a2664fff9fc725d275f766c3221d Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Tue, 30 Jan 2024 03:43:41 +0800 Subject: [PATCH 56/61] [ie/facebook] Support events (#9055) Closes #5355 Authored by: kclauhk --- yt_dlp/extractor/facebook.py | 77 +++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index d186b57bf5..830bbcc3c0 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -54,6 +54,7 @@ class FacebookIE(InfoExtractor): )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| + events/(?:[^/]+/)?| groups/[^/]+/(?:permalink|posts)/| watchparty/ )| @@ -399,6 +400,18 @@ class FacebookIE(InfoExtractor): }, 'playlist_count': 1, 'skip': 'Requires logging in', + }, { + # data.event.cover_media_renderer.cover_video + 'url': 'https://m.facebook.com/events/1509582499515440', + 'info_dict': { + 'id': '637246984455045', + 'ext': 'mp4', + 'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"', + 'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022', + 'thumbnail': r're:^https?://.*', + 'uploader': 'Comitato Liberi Pensatori', + 'uploader_id': '100065709540881', + }, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' _api_config = { @@ -473,38 +486,10 @@ def extract_metadata(webpage): r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - - automatic_captions, subtitles = {}, {} - subs_data = traverse_obj(post, (..., 'video', ..., 'attachments', ..., lambda k, v: ( - k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video'))) - is_video_broadcast = get_first(subs_data, 'is_video_broadcast', expected_type=bool) - captions = get_first(subs_data, 'video_available_captions_locales', 'captions_url') - if url_or_none(captions): # if subs_data only had a 'captions_url' - locale = self._html_search_meta(['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') - subtitles[locale] = [{'url': captions}] - # or else subs_data had 'video_available_captions_locales', a list of dicts - for caption in traverse_obj(captions, ( - {lambda x: sorted(x, key=lambda c: c['locale'])}, lambda _, v: v['captions_url']) - ): - lang = caption.get('localized_language') or '' - subs = { - 'url': caption['captions_url'], - 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), - } - if caption.get('localized_creation_method') or is_video_broadcast: - automatic_captions.setdefault(caption['locale'], []).append(subs) - else: - subtitles.setdefault(caption['locale'], []).append(subs) - media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - uploader_data = ( - get_first(media, ('owner', {dict})) - or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) - or get_first(post, ('node', 'actors', ..., {dict})) or {}) - page_title = title or self._html_search_regex(( r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', r'(?s)(?P.*?)', @@ -513,11 +498,15 @@ def extract_metadata(webpage): description = description or self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) + uploader_data = ( + get_first(media, ('owner', {dict})) + or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) + or get_first(post, ('node', 'actors', ..., {dict})) + or get_first(post, ('event', 'event_creator', {dict})) or {}) uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) - timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) @@ -539,8 +528,6 @@ def extract_metadata(webpage): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), - 'automatic_captions': automatic_captions, - 'subtitles': subtitles, } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -638,6 +625,29 @@ def parse_graphql_video(video): 'url': playable_url, }) extract_dash_manifest(video, formats) + + automatic_captions, subtitles = {}, {} + is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) + for caption in traverse_obj(video, ( + 'video_available_captions_locales', + {lambda x: sorted(x, key=lambda c: c['locale'])}, + lambda _, v: url_or_none(v['captions_url']) + )): + lang = caption.get('localized_language') or 'und' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), + } + if caption.get('localized_creation_method') or is_broadcast: + automatic_captions.setdefault(caption['locale'], []).append(subs) + else: + subtitles.setdefault(caption['locale'], []).append(subs) + captions_url = traverse_obj(video, ('captions_url', {url_or_none})) + if captions_url and not automatic_captions and not subtitles: + locale = self._html_search_meta( + ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] + info = { 'id': v_id, 'formats': formats, @@ -647,6 +657,8 @@ def parse_graphql_video(video): 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) or float_or_none(video.get('length_in_second'))), + 'automatic_captions': automatic_captions, + 'subtitles': subtitles, } process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) @@ -681,7 +693,8 @@ def parse_attachment(attachment, key='media'): for edge in edges: parse_attachment(edge, key='node') - video = data.get('video') or {} + video = traverse_obj(data, ( + 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {} if video: attachments = try_get(video, [ lambda x: x['story']['attachments'], From 67bb70cd700c8d4c3149cd9e0539a5f32c3d1ce6 Mon Sep 17 00:00:00 2001 From: sepro <4618135+seproDev@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:16:46 +0100 Subject: [PATCH 57/61] [ie/Vbox7] Fix extractor (#9100) Closes #1098, Closes #5661 Authored by: seproDev --- yt_dlp/extractor/vbox7.py | 82 ++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/yt_dlp/extractor/vbox7.py b/yt_dlp/extractor/vbox7.py index be35dad1c3..21bf4232b5 100644 --- a/yt_dlp/extractor/vbox7.py +++ b/yt_dlp/extractor/vbox7.py @@ -1,5 +1,6 @@ from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ExtractorError, base_url, int_or_none, url_basename +from ..utils.traversal import traverse_obj class Vbox7IE(InfoExtractor): @@ -19,7 +20,7 @@ class Vbox7IE(InfoExtractor): _GEO_COUNTRIES = ['BG'] _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', - 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'md5': '50ca1f78345a9c15391af47d8062d074', 'info_dict': { 'id': '0946fff23c', 'ext': 'mp4', @@ -29,19 +30,25 @@ class Vbox7IE(InfoExtractor): 'timestamp': 1470982814, 'upload_date': '20160812', 'uploader': 'zdraveibulgaria', - }, - 'params': { - 'proxy': '127.0.0.1:8118', + 'view_count': int, + 'duration': 2640, }, }, { 'url': 'http://vbox7.com/play:249bb972c2', - 'md5': '99f65c0c9ef9b682b97313e052734c3f', + 'md5': 'da1dd2eb245200cb86e6d09d43232116', 'info_dict': { 'id': '249bb972c2', 'ext': 'mp4', 'title': 'Смях! Чудо - чист за секунди - Скрита камера', + 'uploader': 'svideteliat_ot_varshava', + 'view_count': int, + 'timestamp': 1360215023, + 'thumbnail': 'https://i49.vbox7.com/design/iconci/png/noimg6.png', + 'description': 'Смях! Чудо - чист за секунди - Скрита камера', + 'upload_date': '20130207', + 'duration': 83, }, - 'skip': 'georestricted', + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', 'only_matching': True, @@ -53,41 +60,38 @@ class Vbox7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - response = self._download_json( - 'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id, - video_id) + data = self._download_json( + 'https://www.vbox7.com/aj/player/item/options', video_id, + query={'vid': video_id})['options'] - if 'error' in response: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, response['error']), expected=True) + src_url = data.get('src') + if src_url in (None, '', 'blank'): + raise ExtractorError('Video is unavailable', expected=True) - video = response['options'] + fmt_base = url_basename(src_url).rsplit('.', 1)[0].rsplit('_', 1)[0] + if fmt_base == 'vn': + self.raise_geo_restricted() - title = video['title'] - video_url = video['src'] + fmt_base = base_url(src_url) + fmt_base - if '/na.mp4' in video_url: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + formats = self._extract_m3u8_formats( + f'{fmt_base}.m3u8', video_id, m3u8_id='hls', fatal=False) + # TODO: Add MPD formats, when dash range support is added + for res in traverse_obj(data, ('resolutions', lambda _, v: v != 0, {int})): + formats.append({ + 'url': f'{fmt_base}_{res}.mp4', + 'format_id': f'http-{res}', + 'height': res, + }) - uploader = video.get('uploader') - - webpage = self._download_webpage( - 'http://vbox7.com/play:%s' % video_id, video_id, fatal=None) - - info = {} - - if webpage: - info = self._search_json_ld( - webpage.replace('"/*@context"', '"@context"'), video_id, - fatal=False) - - info.update({ + return { 'id': video_id, - 'title': title, - 'url': video_url, - 'uploader': uploader, - 'thumbnail': self._proto_relative_url( - info.get('thumbnail') or self._og_search_thumbnail(webpage), - 'http:'), - }) - return info + 'formats': formats, + **self._search_json_ld(self._download_webpage( + f'https://www.vbox7.com/play:{video_id}', video_id, fatal=False) or '', video_id, fatal=False), + **traverse_obj(data, { + 'title': ('title', {str}), + 'uploader': ('uploader', {str}), + 'duration': ('duration', {int_or_none}), + }), + } From 3725b4f0c93ca3943e6300013a9670e4ab757fda Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 31 Jan 2024 09:35:35 +0100 Subject: [PATCH 58/61] [core] Add `--compat-options 2023` (#9084) Authored by: Grub4K --- README.md | 3 ++- yt_dlp/options.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b6a79667c3..7dc3bb2f6c 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,8 @@ ### Differences in default behavior * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx`. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress` +* `--compat-options 2023`: Same as `--compat-options prefer-legacy-http-handler,manifest-filesize-approx`. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/options.py b/yt_dlp/options.py index e9d927717e..9bea6549d7 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -476,7 +476,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': ['no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], + '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter'], + '2023': ['prefer-legacy-http-handler', 'manifest-filesize-approx'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' From cbed249aaa053a3f425b9bafc97f8dbd71c44487 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 31 Jan 2024 09:43:52 +0100 Subject: [PATCH 59/61] [cookies] Fix `--cookies-from-browser` for `snap` Firefox (#9016) Authored by: Grub4K --- yt_dlp/cookies.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index eac033e391..a92ab41645 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,6 +1,7 @@ import base64 import collections import contextlib +import glob import http.cookiejar import http.cookies import io @@ -122,13 +123,14 @@ def _extract_firefox_cookies(profile, container, logger): return YoutubeDLCookieJar() if profile is None: - search_root = _firefox_browser_dir() + search_roots = list(_firefox_browser_dirs()) elif _is_path(profile): - search_root = profile + search_roots = [profile] else: - search_root = os.path.join(_firefox_browser_dir(), profile) + search_roots = [os.path.join(path, profile) for path in _firefox_browser_dirs()] + search_root = ', '.join(map(repr, search_roots)) - cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) + cookie_database_path = _newest(_firefox_cookie_dbs(search_roots)) if cookie_database_path is None: raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') logger.debug(f'Extracting cookies from: "{cookie_database_path}"') @@ -182,12 +184,21 @@ def _extract_firefox_cookies(profile, container, logger): cursor.connection.close() -def _firefox_browser_dir(): +def _firefox_browser_dirs(): if sys.platform in ('cygwin', 'win32'): - return os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') + yield os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') + elif sys.platform == 'darwin': - return os.path.expanduser('~/Library/Application Support/Firefox/Profiles') - return os.path.expanduser('~/.mozilla/firefox') + yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') + + else: + yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox')) + + +def _firefox_cookie_dbs(roots): + for root in map(os.path.abspath, roots): + for pattern in ('', '*/', 'Profiles/*/'): + yield from glob.iglob(os.path.join(root, pattern, 'cookies.sqlite')) def _get_chromium_based_browser_settings(browser_name): @@ -268,7 +279,7 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.error(f'{browser_name} does not support profiles') search_root = config['browser_dir'] - cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies', logger) + cookie_database_path = _newest(_find_files(search_root, 'Cookies', logger)) if cookie_database_path is None: raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"') logger.debug(f'Extracting cookies from: "{cookie_database_path}"') @@ -947,7 +958,7 @@ def _get_windows_v10_key(browser_root, logger): References: - [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc """ - path = _find_most_recently_used_file(browser_root, 'Local State', logger) + path = _newest(_find_files(browser_root, 'Local State', logger)) if path is None: logger.error('could not find local state file') return None @@ -1049,17 +1060,20 @@ def _get_column_names(cursor, table_name): return [row[1].decode() for row in table_info] -def _find_most_recently_used_file(root, filename, logger): +def _newest(files): + return max(files, key=lambda path: os.lstat(path).st_mtime, default=None) + + +def _find_files(root, filename, logger): # if there are multiple browser profiles, take the most recently used one - i, paths = 0, [] + i = 0 with _create_progress_bar(logger) as progress_bar: - for curr_root, dirs, files in os.walk(root): + for curr_root, _, files in os.walk(root): for file in files: i += 1 progress_bar.print(f'Searching for "{filename}": {i: 6d} files searched') if file == filename: - paths.append(os.path.join(curr_root, file)) - return None if not paths else max(paths, key=lambda path: os.lstat(path).st_mtime) + yield os.path.join(curr_root, file) def _merge_cookie_jars(jars): @@ -1073,7 +1087,7 @@ def _merge_cookie_jars(jars): def _is_path(value): - return os.path.sep in value + return any(sep in value for sep in (os.path.sep, os.path.altsep) if sep) def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None): From 2792092afd367e39251ace1fb2819c855ab8919f Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 31 Jan 2024 09:56:14 +0100 Subject: [PATCH 60/61] [cookies] Improve error message for Windows `--cookies-from-browser chrome` issue (#9080) Authored by: Grub4K --- yt_dlp/cookies.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index a92ab41645..deb2e35f23 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -24,7 +24,8 @@ aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import functools +from .compat import functools # isort: split +from .compat import compat_os_name from .dependencies import ( _SECRETSTORAGE_UNAVAILABLE_REASON, secretstorage, @@ -32,6 +33,7 @@ ) from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( + DownloadError, Popen, error_to_str, expand_path, @@ -318,6 +320,12 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): counts['unencrypted'] = unencrypted_cookies logger.debug(f'cookie version breakdown: {counts}') return jar + except PermissionError as error: + if compat_os_name == 'nt' and error.errno == 13: + message = 'Could not copy Chrome cookie database. See https://github.com/yt-dlp/yt-dlp/issues/7271 for more info' + logger.error(message) + raise DownloadError(message) # force exit + raise finally: if cursor is not None: cursor.connection.close() From d63eae7e7ffb1f3e733e552b9e5e82355bfba214 Mon Sep 17 00:00:00 2001 From: bashonly Date: Wed, 31 Jan 2024 03:11:41 -0600 Subject: [PATCH 61/61] [core] Don't select storyboard formats as fallback Closes #7715 Authored by: bashonly --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 5dcefb5b81..e7d654d0f2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2451,7 +2451,7 @@ def selector_function(ctx): # for extractors with incomplete formats (audio only (soundcloud) # or video only (imgur)) best/worst will fallback to # best/worst {video,audio}-only format - matches = formats + matches = list(filter(lambda f: f.get('vcodec') != 'none' or f.get('acodec') != 'none', formats)) elif seperate_fallback and not ctx['has_merged_format']: # for compatibility with youtube-dl when there is no pre-merged format matches = list(filter(seperate_fallback, formats))