From 176a068cde4f2d9dfa0336168caead0b1edcb8ac Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 16 Jan 2023 15:38:33 -0600 Subject: [PATCH 001/871] [extractor/nbc] Fix XML parsing Python 3.7 compat bug in cb73b8460c3ce6d37ab651a4e44bb23b10056154 Authored by: bashonly --- yt_dlp/extractor/nbc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 82d759f75..b9f65e927 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformIE, default_ns from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( @@ -700,7 +700,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) nbc_data = self._search_json( - r'' + _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) item = video_config['playlist'][0] mcp_id = item.get('mcpID') if mcp_id: - info = self.url_result( - 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id, - 'Anvato', mcp_id) + info = self.url_result(f'{self._ANVATO_PREFIX}{mcp_id}', AnvatoIE, mcp_id) else: media_id = item.get('id') or item['entityId'] title = item.get('title') @@ -157,3 +164,138 @@ def _real_extract(self, url): 'nfl-c-article__title', webpage)) or self._html_search_meta( ['og:title', 'twitter:title'], webpage) return self.playlist_result(entries, display_id, title) + + +class NFLPlusReplayIE(NFLBaseIE): + IE_NAME = 'nfl.com:plus:replay' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/[\w-]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', + 'info_dict': { + 'id': '1572108', + 'ext': 'mp4', + 'title': 'New York Giants at Minnesota Vikings', + 'description': 'New York Giants play the Minnesota Vikings at U.S. Bank Stadium on January 15, 2023', + 'uploader': 'NFL', + 'upload_date': '20230116', + 'timestamp': 1673864520, + 'duration': 7157, + 'categories': ['Game Highlights'], + 'tags': ['Minnesota Vikings', 'New York Giants', 'Minnesota Vikings vs. New York Giants'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + +class NFLPlusEpisodeIE(NFLBaseIE): + IE_NAME = 'nfl.com:plus:episode' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P[\w-]+)' + _TESTS = [{ + 'note': 'premium content', + 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', + 'info_dict': { + 'id': '1576832', + 'ext': 'mp4', + 'title': 'Kurt\'s QB Insider: Conference Championships', + 'description': 'md5:944f7fab56f7a37430bf8473f5473857', + 'uploader': 'NFL', + 'upload_date': '20230127', + 'timestamp': 1674782760, + 'duration': 730, + 'categories': ['Analysis'], + 'tags': ['Cincinnati Bengals at Kansas City Chiefs (2022-POST-3)'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _CLIENT_DATA = { + 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', + 'clientSecret': 'CZuvCL49d9OwfGsR', + 'deviceId': str(uuid.uuid4()), + 'deviceInfo': base64.b64encode(json.dumps({ + 'model': 'desktop', + 'version': 'Chrome', + 'osName': 'Windows', + 'osVersion': '10.0', + }, separators=(',', ':')).encode()).decode(), + 'networkType': 'other', + 'nflClaimGroupsToAdd': [], + 'nflClaimGroupsToRemove': [], + } + _ACCOUNT_INFO = {} + _API_KEY = None + + _TOKEN = None + _TOKEN_EXPIRY = 0 + + def _get_account_info(self, url, video_id): + cookies = self._get_cookies('https://www.nfl.com/') + login_token = traverse_obj(cookies, ( + (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}', + lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')), + {lambda x: x.value}), get_all=False) + if not login_token: + self.raise_login_required() + + account = self._download_json( + 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id, + note='Downloading account info', data=urlencode_postdata({ + 'include': 'profile,data', + 'lang': 'en', + 'APIKey': self._API_KEY, + 'sdk': 'js_latest', + 'login_token': login_token, + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': traverse_obj(cookies, ( + 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'), + 'format': 'json', + }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + self._ACCOUNT_INFO = traverse_obj(account, { + 'signatureTimestamp': 'signatureTimestamp', + 'uid': 'UID', + 'uidSignature': 'UIDSignature', + }) + + if len(self._ACCOUNT_INFO) != 3: + raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) + + def _get_auth_token(self, url, video_id): + if not self._ACCOUNT_INFO: + self._get_account_info(url, video_id) + + token = self._download_json( + 'https://api.nfl.com/identity/v3/token%s' % ( + '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), + video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token', + data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) + + self._TOKEN = token['accessToken'] + self._TOKEN_EXPIRY = token['expiresIn'] + self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + + def _real_extract(self, url): + slug = self._match_id(url) + + if not self._API_KEY: + webpage = self._download_webpage(url, slug, fatal=False) or '' + self._API_KEY = self._search_regex( + r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key', + default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f') + + if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()): + self._get_auth_token(url, slug) + + video_id = self._download_json( + f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ + 'Authorization': f'Bearer {self._TOKEN}', + })['mcpPlaybackId'] + + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) From 9ebac35577e61c3d25fafc959655fa3ab04ca7ef Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 16 Feb 2023 17:06:48 +0530 Subject: [PATCH 064/871] Bugfix for 39f32f1715c0dffb7626dda7307db6388bb7abaa when `--ignore-no-formats-error` --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index d214a6449..33b4fb3ca 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2557,7 +2557,7 @@ def sanitize_numeric_fields(info): formats = self._get_formats(info_dict) # Backward compatibility with InfoExtractor._sort_formats - field_preference = formats[0].pop('__sort_fields', None) + field_preference = (formats or [{}])[0].pop('__sort_fields', None) if field_preference: info_dict['_format_sort_fields'] = field_preference From 149eb0bbf34fa8fdf8d1e2aa28e17479d099e26b Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 16 Feb 2023 08:51:45 -0600 Subject: [PATCH 065/871] [extractor/youtube] Fix `uploader_id` extraction Closes #6247 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 95ca52b3a..4dde4bbaa 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4120,7 +4120,7 @@ def is_bad_format(fmt): 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')), 'description': video_description, 'uploader': get_first(video_details, 'author'), - 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_id': self._search_regex(r'/(?:channel/|user/|@)([^/?&#]+)', owner_profile_url, 'uploader id', default=None), 'uploader_url': owner_profile_url, 'channel_id': channel_id, 'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'), From c9d14bd22ab31e2a41f9f8061843668a06db583b Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 16 Feb 2023 15:54:11 +0100 Subject: [PATCH 066/871] [extractor/crunchyroll] Fix incorrect premium-only error Closes #6234 Authored by: Grub4K --- yt_dlp/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 7d356d673..1abffcd74 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -160,7 +160,7 @@ def _real_extract(self, url): episode_response = self._download_json( f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not episode_response.get('playback'): + if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): if self.is_logged_in: raise ExtractorError('This video is for premium members only', expected=True) else: From 376aa24b1541e2bfb23337c0ae9bafa5bb3787f1 Mon Sep 17 00:00:00 2001 From: Siddhartha Sahu Date: Thu, 16 Feb 2023 14:55:01 -0500 Subject: [PATCH 067/871] Improve default subtitle language selection (#6240) Authored by: sdht0 --- yt_dlp/YoutubeDL.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 33b4fb3ca..4e5c40b58 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2810,10 +2810,14 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True) except re.error as e: raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}') - elif normal_sub_langs: - requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1] else: - requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1] + requested_langs = LazyList(itertools.chain( + ['en'] if 'en' in normal_sub_langs else [], + filter(lambda f: f.startswith('en'), normal_sub_langs), + ['en'] if 'en' in all_sub_langs else [], + filter(lambda f: f.startswith('en'), all_sub_langs), + normal_sub_langs, all_sub_langs, + ))[:1] if requested_langs: self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}') From 72671a212d7c939329cb5d34335fa089dd3acbd3 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 11:57:52 +0900 Subject: [PATCH 068/871] [extractor/viu] Add `ViuOTTIndonesiaIE` extractor (#6099) Authored by: HobbyistDev Closes #1757 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/viu.py | 146 ++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 061a25a4e..081696855 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2181,6 +2181,7 @@ ViuIE, ViuPlaylistIE, ViuOTTIE, + ViuOTTIndonesiaIE, ) from .vk import ( VKIE, diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index dd4cad7ba..6f9af9f64 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -9,9 +9,12 @@ from ..utils import ( ExtractorError, int_or_none, + remove_end, strip_or_none, + traverse_obj, try_get, smuggle_url, + unified_timestamp, unsmuggle_url, url_or_none, ) @@ -394,3 +397,146 @@ def download_playback(): 'formats': formats, 'subtitles': subtitles, } + + +class ViuOTTIndonesiaBaseIE(InfoExtractor): + _BASE_QUERY = { + 'ver': 1.0, + 'fmt': 'json', + 'aver': 5.0, + 'appver': 2.0, + 'appid': 'viu_desktop', + 'platform': 'desktop', + } + + _DEVICE_ID = str(uuid.uuid4()) + _SESSION_ID = str(uuid.uuid4()) + _TOKEN = None + + _HEADERS = { + 'x-session-id': _SESSION_ID, + 'x-client': 'browser' + } + + _AGE_RATINGS_MAPPER = { + 'ADULTS': 18, + 'teens': 13 + } + + def _real_initialize(self): + ViuOTTIndonesiaBaseIE._TOKEN = self._download_json( + 'https://um.viuapi.io/user/identity', None, + headers={'Content-type': 'application/json', **self._HEADERS}, + query={**self._BASE_QUERY, 'iid': self._DEVICE_ID}, + data=json.dumps({'deviceId': self._DEVICE_ID}).encode(), + note='Downloading token information')['token'] + + +class ViuOTTIndonesiaIE(ViuOTTIndonesiaBaseIE): + _VALID_URL = r'https?://www\.viu\.com/ott/\w+/\w+/all/video-[\w-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-drama-tv_shows-detective_conan_episode_793-1165863142?containerId=playlist-26271226', + 'info_dict': { + 'id': '1165863142', + 'ext': 'mp4', + 'episode_number': 793, + 'episode': 'Episode 793', + 'title': 'Detective Conan - Episode 793', + 'duration': 1476, + 'description': 'md5:b79d55345bc1e0217ece22616267c9a5', + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165863189/d-1', + 'upload_date': '20210101', + 'timestamp': 1609459200, + } + }, { + 'url': 'https://www.viu.com/ott/id/id/all/video-korean-reality-tv_shows-entertainment_weekly_episode_1622-1118617054', + 'info_dict': { + 'id': '1118617054', + 'ext': 'mp4', + 'episode_number': 1622, + 'episode': 'Episode 1622', + 'description': 'md5:6d68ca450004020113e9bf27ad99f0f8', + 'title': 'Entertainment Weekly - Episode 1622', + 'duration': 4729, + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1120187848/d-1', + 'timestamp': 1420070400, + 'upload_date': '20150101', + 'cast': ['Shin Hyun-joon', 'Lee Da-Hee'] + } + }, { + # age-limit test + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-trailer-tv_shows-trailer_jujutsu_kaisen_ver_01-1166044219?containerId=playlist-26273140', + 'info_dict': { + 'id': '1166044219', + 'ext': 'mp4', + 'upload_date': '20200101', + 'timestamp': 1577836800, + 'title': 'Trailer \'Jujutsu Kaisen\' Ver.01', + 'duration': 92, + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1166044240/d-1', + 'description': 'Trailer \'Jujutsu Kaisen\' Ver.01', + 'cast': ['Junya Enoki', ' Yûichi Nakamura', ' Yuma Uchida', 'Asami Seto'], + 'age_limit': 13, + } + }, { + # json ld metadata type equal to Movie instead of TVEpisodes + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-animation-movies-demon_slayer_kimetsu_no_yaiba_the_movie_mugen_train-1165892707?containerId=1675060691786', + 'info_dict': { + 'id': '1165892707', + 'ext': 'mp4', + 'timestamp': 1577836800, + 'upload_date': '20200101', + 'title': 'Demon Slayer - Kimetsu no Yaiba - The Movie: Mugen Train', + 'age_limit': 13, + 'cast': 'count:9', + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165895279/d-1', + 'description': 'md5:1ce9c35a3aeab384085533f746c87469', + 'duration': 7021, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_data = self._download_json( + f'https://um.viuapi.io/drm/v1/content/{display_id}', display_id, data=b'', + headers={'Authorization': ViuOTTIndonesiaBaseIE._TOKEN, **self._HEADERS, 'ccode': 'ID'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['playUrl'], display_id) + + initial_state = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', + display_id)['content']['clipDetails'] + for key, url in initial_state.items(): + lang, ext = self._search_regex( + r'^subtitle_(?P[\w-]+)_(?P\w+)$', key, 'subtitle metadata', + default=(None, None), group=('lang', 'ext')) + if lang and ext: + subtitles.setdefault(lang, []).append({ + 'ext': ext, + 'url': url, + }) + + if ext == 'vtt': + subtitles[lang].append({ + 'ext': 'srt', + 'url': f'{remove_end(initial_state[key], "vtt")}srt', + }) + + episode = traverse_obj(list(filter( + lambda x: x.get('@type') in ('TVEpisode', 'Movie'), self._yield_json_ld(webpage, display_id))), 0) or {} + return { + 'id': display_id, + 'title': (traverse_obj(initial_state, 'title', 'display_title') + or episode.get('name')), + 'description': initial_state.get('description') or episode.get('description'), + 'duration': initial_state.get('duration'), + 'thumbnail': traverse_obj(episode, ('image', 'url')), + 'timestamp': unified_timestamp(episode.get('dateCreated')), + 'formats': formats, + 'subtitles': subtitles, + 'episode_number': (traverse_obj(initial_state, 'episode_no', 'episodeno', expected_type=int_or_none) + or int_or_none(episode.get('episodeNumber'))), + 'cast': traverse_obj(episode, ('actor', ..., 'name'), default=None), + 'age_limit': self._AGE_RATINGS_MAPPER.get(initial_state.get('internal_age_rating')) + } From 10fd9e6ee833c88edf6c633f864f42843a708d32 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:00:07 +0900 Subject: [PATCH 069/871] [extractor/odkmedia] Add `OnDemandChinaEpisodeIE` (#6116) Authored by: HobbyistDev, pukkandan --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/odkmedia.py | 105 ++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 yt_dlp/extractor/odkmedia.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 081696855..86fa117b7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1292,6 +1292,7 @@ from .nzonscreen import NZOnScreenIE from .nzz import NZZIE from .odatv import OdaTVIE +from .odkmedia import OnDemandChinaEpisodeIE from .odnoklassniki import OdnoklassnikiIE from .oftv import ( OfTVIE, diff --git a/yt_dlp/extractor/odkmedia.py b/yt_dlp/extractor/odkmedia.py new file mode 100644 index 000000000..2960860d6 --- /dev/null +++ b/yt_dlp/extractor/odkmedia.py @@ -0,0 +1,105 @@ +import json +import urllib.error + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + GeoRestrictedError, + float_or_none, + traverse_obj, + try_call +) + + +class OnDemandChinaEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://www\.ondemandchina\.com/\w+/watch/(?P[\w-]+)/(?Pep-(?P\d+))' + _TESTS = [{ + 'url': 'https://www.ondemandchina.com/en/watch/together-against-covid-19/ep-1', + 'info_dict': { + 'id': '264394', + 'ext': 'mp4', + 'duration': 3256.88, + 'title': 'EP 1 The Calling', + 'alt_title': '第1集 令出如山', + 'thumbnail': 'https://d2y2efdi5wgkcl.cloudfront.net/fit-in/256x256/media-io/2020/9/11/image.d9816e81.jpg', + 'description': '疫情严峻,党政军民学、东西南北中协同应考', + 'tags': ['Social Humanities', 'Documentary', 'Medical', 'Social'], + } + }] + + _QUERY = ''' + query Episode($programSlug: String!, $episodeNumber: Int!) { + episode( + programSlug: $programSlug + episodeNumber: $episodeNumber + kind: "series" + part: null + ) { + id + title + titleEn + titleKo + titleZhHans + titleZhHant + synopsis + synopsisEn + synopsisKo + synopsisZhHans + synopsisZhHant + videoDuration + images { + thumbnail + } + } + }''' + + def _real_extract(self, url): + program_slug, display_id, ep_number = self._match_valid_url(url).group('series', 'id', 'ep') + webpage = self._download_webpage(url, display_id) + + video_info = self._download_json( + 'https://odc-graphql.odkmedia.io/graphql', display_id, + headers={'Content-type': 'application/json'}, + data=json.dumps({ + 'operationName': 'Episode', + 'query': self._QUERY, + 'variables': { + 'programSlug': program_slug, + 'episodeNumber': int(ep_number), + }, + }).encode())['data']['episode'] + + try: + source_json = self._download_json( + f'https://odkmedia.io/odc/api/v2/playback/{video_info["id"]}/', display_id, + headers={'Authorization': '', 'service-name': 'odc'}) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError): + error_data = self._parse_json(e.cause.read(), display_id)['detail'] + raise GeoRestrictedError(error_data) + + formats, subtitles = [], {} + for source in traverse_obj(source_json, ('sources', ...)): + if source.get('type') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('url'), display_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + self.report_warning(f'Unsupported format {source.get("type")}', display_id) + + return { + 'id': str(video_info['id']), + 'duration': float_or_none(video_info.get('videoDuration'), 1000), + 'thumbnail': (traverse_obj(video_info, ('images', 'thumbnail')) + or self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'title': (traverse_obj(video_info, 'title', 'titleEn') + or self._html_search_meta(['og:title', 'twitter:title'], webpage) + or self._html_extract_title(webpage)), + 'alt_title': traverse_obj(video_info, 'titleKo', 'titleZhHans', 'titleZhHant'), + 'description': (traverse_obj( + video_info, 'synopsisEn', 'synopsisKo', 'synopsisZhHans', 'synopsisZhHant', 'synopisis') + or self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + } From a9189510baadf0dccd2d4d363bc6f3a441128bb0 Mon Sep 17 00:00:00 2001 From: OIRNOIR <70721372+OIRNOIR@users.noreply.github.com> Date: Thu, 16 Feb 2023 19:06:16 -0800 Subject: [PATCH 070/871] [extractor/nitter] Update instance list (#6236) Authored by: OIRNOIR --- yt_dlp/extractor/nitter.py | 124 ++++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py index 251bf444f..5d1ca1f5d 100644 --- a/yt_dlp/extractor/nitter.py +++ b/yt_dlp/extractor/nitter.py @@ -39,59 +39,99 @@ class NitterIE(InfoExtractor): ) HTTP_INSTANCES = ( - 'nitter.42l.fr', - 'nitter.pussthecat.org', - 'nitter.nixnet.services', + 'nitter.lacontrevoie.fr', 'nitter.fdn.fr', 'nitter.1d4.us', 'nitter.kavin.rocks', 'nitter.unixfox.eu', 'nitter.domain.glass', - 'nitter.eu', 'nitter.namazso.eu', - 'nitter.actionsack.com', 'birdsite.xanny.family', - 'nitter.hu', - 'twitr.gq', 'nitter.moomoo.me', - 'nittereu.moomoo.me', - 'bird.from.tf', + 'bird.trom.tf', 'nitter.it', 'twitter.censors.us', - 'twitter.grimneko.de', - 'nitter.alefvanoon.xyz', - 'n.hyperborea.cloud', - 'nitter.ca', + 'nitter.grimneko.de', 'twitter.076.ne.jp', - 'twitter.mstdn.social', 'nitter.fly.dev', 'notabird.site', 'nitter.weiler.rocks', - 'nitter.silkky.cloud', 'nitter.sethforprivacy.com', - 'nttr.stream', 'nitter.cutelab.space', 'nitter.nl', 'nitter.mint.lgbt', 'nitter.bus-hit.me', - 'fuckthesacklers.network', - 'nitter.govt.land', - 'nitter.datatunnel.xyz', 'nitter.esmailelbob.xyz', 'tw.artemislena.eu', - 'de.nttr.stream', 'nitter.winscloud.net', 'nitter.tiekoetter.com', 'nitter.spaceint.fr', - 'twtr.bch.bar', - 'nitter.exonip.de', - 'nitter.mastodon.pro', - 'nitter.notraxx.ch', - - - # not in the list anymore - 'nitter.skrep.in', - 'nitter.snopyta.org', + 'nitter.privacy.com.de', + 'nitter.poast.org', + 'nitter.bird.froth.zone', + 'nitter.dcs0.hu', + 'twitter.dr460nf1r3.org', + 'nitter.garudalinux.org', + 'twitter.femboy.hu', + 'nitter.cz', + 'nitter.privacydev.net', + 'nitter.evil.site', + 'tweet.lambda.dance', + 'nitter.kylrth.com', + 'nitter.foss.wtf', + 'nitter.priv.pw', + 'nitter.tokhmi.xyz', + 'nitter.catalyst.sx', + 'unofficialbird.com', + 'nitter.projectsegfau.lt', + 'nitter.eu.projectsegfau.lt', + 'singapore.unofficialbird.com', + 'canada.unofficialbird.com', + 'india.unofficialbird.com', + 'nederland.unofficialbird.com', + 'uk.unofficialbird.com', + 'n.l5.ca', + 'nitter.slipfox.xyz', + 'nitter.soopy.moe', + 'nitter.qwik.space', + 'read.whatever.social', + 'nitter.rawbit.ninja', + 'nt.vern.cc', + 'ntr.odyssey346.dev', + 'nitter.ir', + 'nitter.privacytools.io', + 'nitter.sneed.network', + 'n.sneed.network', + 'nitter.manasiwibi.com', + 'nitter.smnz.de', + 'nitter.twei.space', + 'nitter.inpt.fr', + 'nitter.d420.de', + 'nitter.caioalonso.com', + 'nitter.at', + 'nitter.drivet.xyz', + 'nitter.pw', + 'nitter.nicfab.eu', + 'bird.habedieeh.re', + 'nitter.hostux.net', + 'nitter.adminforge.de', + 'nitter.platypush.tech', + 'nitter.mask.sh', + 'nitter.pufe.org', + 'nitter.us.projectsegfau.lt', + 'nitter.arcticfoxes.net', + 't.com.sb', + 'nitter.kling.gg', + 'nitter.ktachibana.party', + 'nitter.riverside.rocks', + 'nitter.girlboss.ceo', + 'nitter.lunar.icu', + 'twitter.moe.ngo', + 'nitter.freedit.eu', + 'ntr.frail.duckdns.org', + 'nitter.librenode.org', + 'n.opnxng.com', + 'nitter.plus.st', ) DEAD_INSTANCES = ( @@ -117,6 +157,32 @@ class NitterIE(InfoExtractor): 'nitter.weaponizedhumiliation.com', 'nitter.vxempire.xyz', 'tweet.lambda.dance', + 'nitter.ca', + 'nitter.42l.fr', + 'nitter.pussthecat.org', + 'nitter.nixnet.services', + 'nitter.eu', + 'nitter.actionsack.com', + 'nitter.hu', + 'twitr.gq', + 'nittereu.moomoo.me', + 'bird.from.tf', + 'twitter.grimneko.de', + 'nitter.alefvanoon.xyz', + 'n.hyperborea.cloud', + 'twitter.mstdn.social', + 'nitter.silkky.cloud', + 'nttr.stream', + 'fuckthesacklers.network', + 'nitter.govt.land', + 'nitter.datatunnel.xyz', + 'de.nttr.stream', + 'twtr.bch.bar', + 'nitter.exonip.de', + 'nitter.mastodon.pro', + 'nitter.notraxx.ch', + 'nitter.skrep.in', + 'nitter.snopyta.org', ) INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES From 65e5c021e7c5f23ecbc6a982b72a02ac6cd6900d Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Fri, 17 Feb 2023 05:08:45 +0200 Subject: [PATCH 071/871] [utils] Don't use Content-length with encoding (#6176) Authored by: felixonmars Closes #3772, #6178 --- yt_dlp/downloader/http.py | 7 ++++++- yt_dlp/utils.py | 3 --- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 95c870ee8..fa72d5722 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -211,7 +211,12 @@ def close_stream(): ctx.stream = None def download(): - data_len = ctx.data.info().get('Content-length', None) + data_len = ctx.data.info().get('Content-length') + + if ctx.data.info().get('Content-encoding'): + # Content-encoding is present, Content-length is not reliable anymore as we are + # doing auto decompression. (See: https://github.com/yt-dlp/yt-dlp/pull/6176) + data_len = None # Range HTTP header may be ignored/unsupported by a webserver # (e.g. extractor/scivee.py, extractor/bambuser.py). diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 7cf151e3a..2d9e61c5b 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1438,19 +1438,16 @@ def http_response(self, req, resp): raise original_ioerror resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - del resp.headers['Content-encoding'] # brotli if resp.headers.get('Content-encoding', '') == 'br': resp = urllib.request.addinfourl( io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # https://github.com/ytdl-org/youtube-dl/issues/6457). if 300 <= resp.code < 400: From da880559a6ecbbf374cc9f3378e696b55b9599af Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Thu, 16 Feb 2023 23:14:33 -0400 Subject: [PATCH 072/871] [extractor/ebay] Add extractor (#6170) Closes #6134 Authored by: JChris246 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/ebay.py | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 yt_dlp/extractor/ebay.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 86fa117b7..a9ab66fc7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -507,6 +507,7 @@ ) from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE from .ebaumsworld import EbaumsWorldIE +from .ebay import EbayIE from .echomsk import EchoMskIE from .egghead import ( EggheadCourseIE, diff --git a/yt_dlp/extractor/ebay.py b/yt_dlp/extractor/ebay.py new file mode 100644 index 000000000..d0eb9fc51 --- /dev/null +++ b/yt_dlp/extractor/ebay.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class EbayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ebay\.com/itm/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.ebay.com/itm/194509326719', + 'info_dict': { + 'id': '194509326719', + 'ext': 'mp4', + 'title': 'WiFi internal antenna adhesive for wifi 2.4GHz wifi 5 wifi 6 wifi 6E full bands', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_json = self._search_json(r'"video":', webpage, 'video json', video_id) + + formats = [] + for key, url in video_json['playlistMap'].items(): + if key == 'HLS': + formats.extend(self._extract_m3u8_formats(url, video_id, fatal=False)) + elif key == 'DASH': + formats.extend(self._extract_mpd_formats(url, video_id, fatal=False)) + else: + self.report_warning(f'Unsupported format {key}', video_id) + + return { + 'id': video_id, + 'title': remove_end(self._html_extract_title(webpage), ' | eBay'), + 'formats': formats + } From e4a8b1769e19755acba6d8f212208359905a3159 Mon Sep 17 00:00:00 2001 From: qbnu <93988953+qbnu@users.noreply.github.com> Date: Fri, 17 Feb 2023 03:18:07 +0000 Subject: [PATCH 073/871] [extractor/vocaroo] Add extractor (#6117) Authored by: qbnu, SuperSonicHub1 Closes #6152 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/vocaroo.py | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 yt_dlp/extractor/vocaroo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a9ab66fc7..0a36e98de 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2190,6 +2190,7 @@ VKUserVideosIE, VKWallPostIE, ) +from .vocaroo import VocarooIE from .vodlocker import VodlockerIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE diff --git a/yt_dlp/extractor/vocaroo.py b/yt_dlp/extractor/vocaroo.py new file mode 100644 index 000000000..704e25c22 --- /dev/null +++ b/yt_dlp/extractor/vocaroo.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + float_or_none, +) + + +class VocarooIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:vocaroo\.com|voca\.ro)/(?:embed/)?(?P\w+)' + _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?://)?(?:www\.)?vocaroo\.com/embed/.+?)\1'] + _TESTS = [ + { + 'url': 'https://vocaroo.com/1de8yA3LNe77', + 'md5': 'c557841d5e50261777a6585648adf439', + 'info_dict': { + 'id': '1de8yA3LNe77', + 'ext': 'mp3', + 'title': 'Vocaroo video #1de8yA3LNe77', + 'timestamp': 1675059800.370, + 'upload_date': '20230130', + }, + }, + { + 'url': 'https://vocaroo.com/embed/12WqtjLnpj6g?autoplay=0', + 'only_matching': True, + }, + { + 'url': 'https://voca.ro/12D52rgpzkB0', + 'only_matching': True, + }, + ] + + _WEBPAGE_TESTS = [ + { + 'url': 'https://qbnu.github.io/cool.html', + 'md5': 'f322e529275dd8a47994919eeac404a5', + 'info_dict': { + 'id': '19cgWmKO6AmC', + 'ext': 'mp3', + 'title': 'Vocaroo video #19cgWmKO6AmC', + 'timestamp': 1675093841.408, + 'upload_date': '20230130', + }, + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + if len(audio_id) == 10 or (len(audio_id) == 12 and audio_id[0] == '1'): + media_subdomain = 'media1' + else: + media_subdomain = 'media' + + url = f'https://{media_subdomain}.vocaroo.com/mp3/{audio_id}' + http_headers = {'Referer': 'https://vocaroo.com/'} + resp = self._request_webpage(HEADRequest(url), audio_id, headers=http_headers) + return { + 'id': audio_id, + 'title': '', + 'url': url, + 'ext': 'mp3', + 'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000), + 'vcodec': 'none', + 'http_headers': http_headers, + } From 361630015535026712bdb67f804a15b65ff9ee7e Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:19:24 +0900 Subject: [PATCH 074/871] [extractor/yappy] Add extractor (#6111) Authored by: HobbyistDev Closes #3522 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/yappy.py | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 yt_dlp/extractor/yappy.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0a36e98de..4aab6ea78 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2329,6 +2329,7 @@ ZenYandexChannelIE, ) from .yapfiles import YapFilesIE +from .yappy import YappyIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .yle_areena import YleAreenaIE diff --git a/yt_dlp/extractor/yappy.py b/yt_dlp/extractor/yappy.py new file mode 100644 index 000000000..f168bdbf9 --- /dev/null +++ b/yt_dlp/extractor/yappy.py @@ -0,0 +1,99 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, + url_or_none +) + + +class YappyIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/video/(?P\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/video/47fea6d8586f48d1a0cf96a7342aabd2', + 'info_dict': { + 'id': '47fea6d8586f48d1a0cf96a7342aabd2', + 'ext': 'mp4', + 'title': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻', + 'timestamp': 1661893200, + 'description': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/thumbnails/0c7c4d73388f47848acaf540d2e2bb8c-thumbnail.jpg', + 'upload_date': '20220830', + 'view_count': int, + 'like_count': int, + 'uploader_id': '59a0c8c485e5410b9c43474bf4c6a373', + 'categories': ['Образование и наука', 'Лайфхак', 'Технологии', 'Арт/искусство'], + 'repost_count': int, + 'uploader': 'YAPPY', + } + }, { + 'url': 'https://yappy.media/video/3862451954ad4bd58ae2ccefddb0bd33', + 'info_dict': { + 'id': '3862451954ad4bd58ae2ccefddb0bd33', + 'ext': 'mp4', + 'title': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'timestamp': 1674726985, + 'like_count': int, + 'description': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'uploader_id': '6793ee3581974a3586fc01e157de6c99', + 'view_count': int, + 'repost_count': int, + 'uploader': 'LENA SHTURMAN', + 'upload_date': '20230126', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/user_thumbnails/6e76bb4bbad640b6/9ec84c115b2b1967/1674716171.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_ld = self._search_json_ld(webpage, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id) + + media_data = ( + traverse_obj( + nextjs_data, ('props', 'pageProps', ('data', 'OpenGraphParameters')), get_all=False) + or self._download_json(f'https://yappy.media/api/video/{video_id}', video_id)) + + media_url = traverse_obj(media_data, ('link', {url_or_none})) or '' + has_watermark = media_url.endswith('-wm.mp4') + + formats = [{ + 'url': media_url, + 'ext': 'mp4', + 'format_note': 'Watermarked' if has_watermark else None, + 'preference': -10 if has_watermark else None + }] if media_url else [] + + if has_watermark: + formats.append({ + 'url': media_url.replace('-wm.mp4', '.mp4'), + 'ext': 'mp4' + }) + + audio_link = traverse_obj(media_data, ('audio', 'link')) + if audio_link: + formats.append({ + 'url': audio_link, + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none' + }) + + return { + 'id': video_id, + 'title': (json_ld.get('description') or self._html_search_meta(['og:title'], webpage) + or self._html_extract_title(webpage)), + 'formats': formats, + 'thumbnail': (media_data.get('thumbnail') + or self._html_search_meta(['og:image', 'og:image:secure_url'], webpage)), + 'description': (media_data.get('description') or json_ld.get('description') + or self._html_search_meta(['description', 'og:description'], webpage)), + 'timestamp': unified_timestamp(media_data.get('publishedAt') or json_ld.get('timestamp')), + 'view_count': int_or_none(media_data.get('viewsCount') or json_ld.get('view_count')), + 'like_count': int_or_none(media_data.get('likesCount')), + 'uploader': traverse_obj(media_data, ('creator', 'firstName')), + 'uploader_id': traverse_obj(media_data, ('creator', ('uuid', 'nickname')), get_all=False), + 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None, + 'repost_count': int_or_none(media_data.get('sharingCount')) + } From b25d6cb96337d479bdcb41768356da414c3aa835 Mon Sep 17 00:00:00 2001 From: Alex Ionescu Date: Fri, 17 Feb 2023 04:29:32 +0100 Subject: [PATCH 075/871] [utils] Fix race condition in `make_dir` (#6089) Authored by: aionescu --- yt_dlp/cache.py | 6 +----- yt_dlp/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index 7be91eae5..f8344fe77 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -39,11 +39,7 @@ def store(self, section, key, data, dtype='json'): fn = self._get_cache_fn(section, key, dtype) try: - try: - os.makedirs(os.path.dirname(fn)) - except OSError as ose: - if ose.errno != errno.EEXIST: - raise + os.makedirs(os.path.dirname(fn), exist_ok=True) self._ydl.write_debug(f'Saving {section}.{key} to cache') write_json_file({'yt-dlp_version': __version__, 'data': data}, fn) except Exception: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 2d9e61c5b..736468aef 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5370,8 +5370,8 @@ def random_uuidv4(): def make_dir(path, to_screen=None): try: dn = os.path.dirname(path) - if dn and not os.path.exists(dn): - os.makedirs(dn) + if dn: + os.makedirs(dn, exist_ok=True) return True except OSError as err: if callable(to_screen) is not None: From a4ad59ff2ded208bf33f6fe07299a3449eadccdc Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:59:04 +0900 Subject: [PATCH 076/871] [extractor/anchorfm] Add episode extractor (#6092) Authored by: HobbyistDev, bashonly Closes #6081 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/anchorfm.py | 98 +++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 yt_dlp/extractor/anchorfm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4aab6ea78..6bba25506 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -102,6 +102,7 @@ AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, ) +from .anchorfm import AnchorFMEpisodeIE from .angel import AngelIE from .anvato import AnvatoIE from .aol import AolIE diff --git a/yt_dlp/extractor/anchorfm.py b/yt_dlp/extractor/anchorfm.py new file mode 100644 index 000000000..52f2ad057 --- /dev/null +++ b/yt_dlp/extractor/anchorfm.py @@ -0,0 +1,98 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + unified_timestamp +) + + +class AnchorFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://anchor\.fm/(?P\w+)/(?:embed/)?episodes/[\w-]+-(?P\w+)' + _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://anchor.fm/lovelyti/episodes/Chrisean-Rock-takes-to-twitter-to-announce-shes-pregnant--Blueface-denies-he-is-the-father-e1tpt3d', + 'info_dict': { + 'id': 'e1tpt3d', + 'ext': 'mp3', + 'title': ' Chrisean Rock takes to twitter to announce she\'s pregnant, Blueface denies he is the father!', + 'description': 'md5:207d167de3e28ceb4ddc1ebf5a30044c', + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_nologo/1034827/1034827-1658438968460-5f3bfdf3601e8.jpg', + 'duration': 624.718, + 'uploader': 'Lovelyti ', + 'uploader_id': '991541', + 'channel': 'lovelyti', + 'modified_date': '20230121', + 'modified_timestamp': 1674285178, + 'release_date': '20230121', + 'release_timestamp': 1674285179, + 'episode_id': 'e1tpt3d', + } + }, { + # embed url + 'url': 'https://anchor.fm/apakatatempo/embed/episodes/S2E75-Perang-Bintang-di-Balik-Kasus-Ferdy-Sambo-dan-Ismail-Bolong-e1shjqd', + 'info_dict': { + 'id': 'e1shjqd', + 'ext': 'mp3', + 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong', + 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41', + 'duration': 1042.008, + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg', + 'release_date': '20221221', + 'release_timestamp': 1671595916, + 'modified_date': '20221221', + 'modified_timestamp': 1671590834, + 'channel': 'apakatatempo', + 'uploader': 'Podcast Tempo', + 'uploader_id': '2585461', + 'season': 'Season 2', + 'season_number': 2, + 'episode_id': 'e1shjqd', + } + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://podcast.tempo.co/podcast/192/perang-bintang-di-balik-kasus-ferdy-sambo-dan-ismail-bolong', + 'info_dict': { + 'id': 'e1shjqd', + 'ext': 'mp3', + 'release_date': '20221221', + 'duration': 1042.008, + 'season': 'Season 2', + 'modified_timestamp': 1671590834, + 'uploader_id': '2585461', + 'modified_date': '20221221', + 'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41', + 'season_number': 2, + 'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong', + 'release_timestamp': 1671595916, + 'episode_id': 'e1shjqd', + 'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg', + 'uploader': 'Podcast Tempo', + 'channel': 'apakatatempo', + } + }] + + def _real_extract(self, url): + channel_name, episode_id = self._match_valid_url(url).group('channel_name', 'episode_id') + api_data = self._download_json(f'https://anchor.fm/api/v3/episodes/{episode_id}', episode_id) + + return { + 'id': episode_id, + 'title': traverse_obj(api_data, ('episode', 'title')), + 'url': traverse_obj(api_data, ('episode', 'episodeEnclosureUrl'), ('episodeAudios', 0, 'url')), + 'ext': 'mp3', + 'vcodec': 'none', + 'thumbnail': traverse_obj(api_data, ('episode', 'episodeImage')), + 'description': clean_html(traverse_obj(api_data, ('episode', ('description', 'descriptionPreview')), get_all=False)), + 'duration': float_or_none(traverse_obj(api_data, ('episode', 'duration')), 1000), + 'modified_timestamp': unified_timestamp(traverse_obj(api_data, ('episode', 'modified'))), + 'release_timestamp': int_or_none(traverse_obj(api_data, ('episode', 'publishOnUnixTimestamp'))), + 'episode_id': episode_id, + 'uploader': traverse_obj(api_data, ('creator', 'name')), + 'uploader_id': str_or_none(traverse_obj(api_data, ('creator', 'userId'))), + 'season_number': int_or_none(traverse_obj(api_data, ('episode', 'podcastSeasonNumber'))), + 'channel': channel_name or traverse_obj(api_data, ('creator', 'vanitySlug')), + } From 31c279a2a2c2ef402a9e6dad9992b310d16439a6 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 13:03:04 +0900 Subject: [PATCH 077/871] [extractor/hypergryph] Add extractor (#6094) Authored by: HobbyistDev, bashonly Closes #6052 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/hypergryph.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 yt_dlp/extractor/hypergryph.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6bba25506..70cb82277 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -747,6 +747,7 @@ HungamaAlbumPlaylistIE, ) from .hypem import HypemIE +from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE from .icareus import IcareusIE from .ichinanalive import ( diff --git a/yt_dlp/extractor/hypergryph.py b/yt_dlp/extractor/hypergryph.py new file mode 100644 index 000000000..9ca6caebc --- /dev/null +++ b/yt_dlp/extractor/hypergryph.py @@ -0,0 +1,32 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class MonsterSirenHypergryphMusicIE(InfoExtractor): + _VALID_URL = r'https?://monster-siren\.hypergryph\.com/music/(?P\d+)' + _TESTS = [{ + 'url': 'https://monster-siren.hypergryph.com/music/514562', + 'info_dict': { + 'id': '514562', + 'ext': 'wav', + 'artist': ['塞壬唱片-MSR'], + 'album': 'Flame Shadow', + 'title': 'Flame Shadow', + } + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + json_data = self._search_json( + r'window\.g_initialProps\s*=', webpage, 'data', audio_id, transform_source=js_to_json) + + return { + 'id': audio_id, + 'title': traverse_obj(json_data, ('player', 'songDetail', 'name')), + 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')), + 'ext': 'wav', + 'vcodec': 'none', + 'artist': traverse_obj(json_data, ('player', 'songDetail', 'artists')), + 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')) + } From 5e1a54f63e393c218a40949012ff0de0ce63cb15 Mon Sep 17 00:00:00 2001 From: Friedrich Rehren Date: Fri, 17 Feb 2023 08:44:26 +0100 Subject: [PATCH 078/871] [extractor/SportDeutschland] Fix extractor (#6041) Authored by: FriedrichRehren Closes #3005 --- yt_dlp/extractor/sportdeutschland.py | 157 +++++++++++++++------------ 1 file changed, 86 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py index 75074b310..6fc3ce9eb 100644 --- a/yt_dlp/extractor/sportdeutschland.py +++ b/yt_dlp/extractor/sportdeutschland.py @@ -1,95 +1,110 @@ from .common import InfoExtractor + from ..utils import ( - clean_html, - float_or_none, - int_or_none, - parse_iso8601, - parse_qs, - strip_or_none, - try_get, + format_field, + traverse_obj, + unified_timestamp, + strip_or_none ) class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport/buchholzer-formationswochenende-2023-samstag-1-bundesliga-landesliga', 'info_dict': { - 'id': '5318cac0275701382770543d7edaf0a0', + 'id': '983758e9-5829-454d-a3cf-eb27bccc3c94', 'ext': 'mp4', - 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', - 'duration': 16106.36, - }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, + 'title': 'Buchholzer Formationswochenende 2023 - Samstag - 1. Bundesliga / Landesliga', + 'description': 'md5:a288c794a5ee69e200d8f12982f81a87', + 'live_status': 'was_live', + 'channel': 'Blau-Weiss Buchholz Tanzsport', + 'channel_url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport', + 'channel_id': '93ec33c9-48be-43b6-b404-e016b64fdfa3', + 'display_id': '9839a5c7-0dbb-48a8-ab63-3b408adc7b54', + 'duration': 32447, + 'upload_date': '20230114', + 'timestamp': 1673730018.0, + } }, { - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'url': 'https://sportdeutschland.tv/deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0', 'info_dict': { - 'id': 'c6e2fdd01f63013854c47054d2ab776f', - 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', - 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', - 'duration': 31397, - }, - 'playlist_count': 2, - }, { - 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', - 'only_matching': True, + 'id': '95b97d9a-04f6-4880-9039-182985c33943', + 'ext': 'mp4', + 'title': 'BWF Tour: 1. Runde Feld 1 - YONEX GAINWARD German Open 2022', + 'description': 'md5:2afb5996ceb9ac0b2ac81f563d3a883e', + 'live_status': 'was_live', + 'channel': 'Deutscher Badminton Verband', + 'channel_url': 'https://sportdeutschland.tv/deutscherbadmintonverband', + 'channel_id': '93ca5866-2551-49fc-8424-6db35af58920', + 'display_id': '95c80c52-6b9a-4ae9-9197-984145adfced', + 'duration': 41097, + 'upload_date': '20220309', + 'timestamp': 1646860727.0, + } }] def _real_extract(self, url): display_id = self._match_id(url) - data = self._download_json( - 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + meta = self._download_json( + 'https://api.sportdeutschland.tv/api/stateless/frontend/assets/' + display_id, display_id, query={'access_token': 'true'}) - asset = data['asset'] - title = (asset.get('title') or asset['label']).strip() - asset_id = asset.get('id') or asset.get('uuid') + + asset_id = traverse_obj(meta, 'id', 'uuid') + info = { 'id': asset_id, - 'title': title, - 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), - 'duration': int_or_none(asset.get('seconds')), + 'channel_url': format_field(meta, ('profile', 'slug'), 'https://sportdeutschland.tv/%s'), + **traverse_obj(meta, { + 'title': (('title', 'name'), {strip_or_none}), + 'description': 'description', + 'channel': ('profile', 'name'), + 'channel_id': ('profile', 'id'), + 'is_live': 'currently_live', + 'was_live': 'was_live' + }, get_all=False) } - videos = asset.get('videos') or [] - if len(videos) > 1: - playlist_id = parse_qs(url).get('playlistId', [None])[0] - if not self._yes_playlist(playlist_id, asset_id): - videos = [videos[int(playlist_id)]] - def entries(): - for i, video in enumerate(videos, 1): - video_id = video.get('uuid') - video_url = video.get('url') - if not (video_id and video_url): - continue - formats = self._extract_m3u8_formats( - video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) - if not formats and not self.get_param('ignore_no_formats'): - continue - yield { - 'id': video_id, - 'formats': formats, - 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), - 'duration': float_or_none(video.get('duration')), - } + videos = meta.get('videos') or [] + + if len(videos) > 1: info.update({ '_type': 'multi_video', - 'entries': entries(), - }) - else: - formats = self._extract_m3u8_formats( - videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') - section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) - info.update({ - 'formats': formats, - 'display_id': asset.get('permalink'), - 'thumbnail': try_get(asset, lambda x: x['images'][0]), - 'categories': [section_title] if section_title else None, - 'view_count': int_or_none(asset.get('views')), - 'is_live': asset.get('is_live') is True, - 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), - }) + 'entries': self.processVideoOrStream(asset_id, video) + } for video in enumerate(videos) if video.get('formats')) + + elif len(videos) == 1: + info.update( + self.processVideoOrStream(asset_id, videos[0]) + ) + + livestream = meta.get('livestream') + + if livestream is not None: + info.update( + self.processVideoOrStream(asset_id, livestream) + ) + return info + + def process_video_or_stream(self, asset_id, video): + video_id = video['id'] + video_src = video['src'] + video_type = video['type'] + + token = self._download_json( + f'https://api.sportdeutschland.tv/api/frontend/asset-token/{asset_id}', + video_id, query={'type': video_type, 'playback_id': video_src})['token'] + formats = self._extract_m3u8_formats(f'https://stream.mux.com/{video_src}.m3u8?token={token}', video_id) + + video_data = { + 'display_id': video_id, + 'formats': formats, + } + if video_type == 'mux_vod': + video_data.update({ + 'duration': video.get('duration'), + 'timestamp': unified_timestamp(video.get('created_at')) + }) + + return video_data From f737fb16d8234408c85bc189ccc926fea000515b Mon Sep 17 00:00:00 2001 From: Chris Caruso Date: Fri, 17 Feb 2023 00:06:15 -0800 Subject: [PATCH 079/871] [ExtractAudio] Handle outtmpl without ext (#6005) Authored by: carusocr Closes #5968 --- yt_dlp/__init__.py | 4 ---- yt_dlp/postprocessor/ffmpeg.py | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 255b31735..fb44303a2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -318,10 +318,6 @@ def validate_outtmpl(tmpl, msg): if outtmpl_default == '': opts.skip_download = None del opts.outtmpl['default'] - if outtmpl_default and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio: - raise ValueError( - 'Cannot download a video and extract audio into the same file! ' - f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') def parse_chapters(name, value): chapters, ranges = [], [] diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 5acd75376..123a95a3a 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -508,8 +508,7 @@ def run(self, information): if acodec != 'copy': more_opts = self._quality_args(acodec) - # not os.path.splitext, since the latter does not work on unicode in all setups - temp_path = new_path = f'{path.rpartition(".")[0]}.{extension}' + temp_path = new_path = replace_extension(path, extension, information['ext']) if new_path == path: if acodec == 'copy': From c61cf091a54d3aa3c611722035ccde5ecfe981bb Mon Sep 17 00:00:00 2001 From: bashonly Date: Fri, 17 Feb 2023 02:14:45 -0600 Subject: [PATCH 080/871] [extractor/youtube] `uploader_id` includes `@` with handle Authored by: bashonly --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4dde4bbaa..d891d92a3 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4120,7 +4120,7 @@ def is_bad_format(fmt): 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')), 'description': video_description, 'uploader': get_first(video_details, 'author'), - 'uploader_id': self._search_regex(r'/(?:channel/|user/|@)([^/?&#]+)', owner_profile_url, 'uploader id', default=None), + 'uploader_id': self._search_regex(r'/(?:channel/|user/|(?=@))([^/?&#]+)', owner_profile_url, 'uploader id', default=None), 'uploader_url': owner_profile_url, 'channel_id': channel_id, 'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'), From d50ea3ce5abc3b0defc0e5d1e22b22ce9b01b07b Mon Sep 17 00:00:00 2001 From: Henrik Heimbuerger Date: Fri, 17 Feb 2023 09:32:55 +0100 Subject: [PATCH 081/871] [extractor/nebula] Remove broken cookie support (#5979) Authored by: hheimbuerger Closes #4002 --- yt_dlp/extractor/nebula.py | 119 +++++++++++-------------------------- 1 file changed, 35 insertions(+), 84 deletions(-) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 861fcb164..81e2f56e6 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,11 +1,9 @@ import itertools import json -import time import urllib.error -import urllib.parse from .common import InfoExtractor -from ..utils import ExtractorError, parse_iso8601, try_get +from ..utils import ExtractorError, parse_iso8601 _BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' @@ -15,11 +13,10 @@ class NebulaBaseIE(InfoExtractor): _nebula_api_token = None _nebula_bearer_token = None - _zype_access_token = None def _perform_nebula_auth(self, username, password): if not username or not password: - self.raise_login_required() + self.raise_login_required(method='password') data = json.dumps({'email': username, 'password': password}).encode('utf8') response = self._download_json( @@ -33,38 +30,10 @@ def _perform_nebula_auth(self, username, password): note='Logging in to Nebula with supplied credentials', errnote='Authentication failed or rejected') if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - urllib.parse.quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) + self.raise_login_required(method='password') return response['key'] - def _retrieve_nebula_api_token(self, username=None, password=None): - """ - Check cookie jar for valid token. Try to authenticate using credentials if no valid token - can be found in the cookie jar. - """ - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) - nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') - if nebula_api_token: - return nebula_api_token - - return self._perform_nebula_auth(username, password) - def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): assert method in ('GET', 'POST',) assert auth_type in ('api', 'bearer',) @@ -95,35 +64,24 @@ def _fetch_nebula_bearer_token(self): note='Authorizing to Nebula') return response['token'] - def _fetch_zype_access_token(self): - """ - Get a Zype access token, which is required to access video streams -- in our case: to - generate video URLs. - """ - user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. ' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token + def _fetch_video_formats(self, slug): + stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/', + video_id=slug, + auth_type='bearer', + note='Fetching video stream info') + manifest_url = stream_info['manifest'] + return self._extract_m3u8_formats_and_subtitles(manifest_url, slug) def _build_video_info(self, episode): - zype_id = episode['zype_id'] - zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + fmts, subs = self._fetch_video_formats(episode['slug']) channel_slug = episode['channel_slug'] + channel_title = episode['channel_title'] return { 'id': episode['zype_id'], 'display_id': episode['slug'], - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': zype_video_url, + 'formats': fmts, + 'subtitles': subs, + 'webpage_url': f'https://nebula.tv/{episode["slug"]}', 'title': episode['title'], 'description': episode['description'], 'timestamp': parse_iso8601(episode['published_at']), @@ -133,27 +91,26 @@ def _build_video_info(self, episode): 'height': key, } for key, tn in episode['assets']['thumbnail'].items()], 'duration': episode['duration'], - 'channel': episode['channel_title'], + 'channel': channel_title, 'channel_id': channel_slug, - 'channel_url': f'https://nebula.app/{channel_slug}', - 'uploader': episode['channel_title'], + 'channel_url': f'https://nebula.tv/{channel_slug}', + 'uploader': channel_title, 'uploader_id': channel_slug, - 'uploader_url': f'https://nebula.app/{channel_slug}', - 'series': episode['channel_title'], - 'creator': episode['channel_title'], + 'uploader_url': f'https://nebula.tv/{channel_slug}', + 'series': channel_title, + 'creator': channel_title, } def _perform_login(self, username=None, password=None): - self._nebula_api_token = self._retrieve_nebula_api_token(username, password) + self._nebula_api_token = self._perform_nebula_auth(username, password) self._nebula_bearer_token = self._fetch_nebula_bearer_token() - self._zype_access_token = self._fetch_zype_access_token() class NebulaIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P[-\w]+)' _TESTS = [ { - 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', + 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'md5': '14944cfee8c7beeea106320c47560efc', 'info_dict': { 'id': '5c271b40b13fd613090034fd', @@ -167,19 +124,17 @@ class NebulaIE(NebulaBaseIE): 'uploader': 'Lindsay Ellis', 'uploader_id': 'lindsayellis', 'timestamp': 1533009600, - 'uploader_url': 'https://nebula.app/lindsayellis', + 'uploader_url': 'https://nebula.tv/lindsayellis', 'series': 'Lindsay Ellis', - 'average_rating': int, 'display_id': 'that-time-disney-remade-beauty-and-the-beast', - 'channel_url': 'https://nebula.app/lindsayellis', + 'channel_url': 'https://nebula.tv/lindsayellis', 'creator': 'Lindsay Ellis', 'duration': 2212, - 'view_count': int, 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { - 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'md5': 'd05739cf6c38c09322422f696b569c23', 'info_dict': { 'id': '5e7e78171aaf320001fbd6be', @@ -192,19 +147,17 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'realengineering', 'uploader': 'Real Engineering', 'uploader_id': 'realengineering', - 'view_count': int, 'series': 'Real Engineering', - 'average_rating': int, 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'creator': 'Real Engineering', 'duration': 841, - 'channel_url': 'https://nebula.app/realengineering', - 'uploader_url': 'https://nebula.app/realengineering', + 'channel_url': 'https://nebula.tv/realengineering', + 'uploader_url': 'https://nebula.tv/realengineering', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { - 'url': 'https://nebula.app/videos/money-episode-1-the-draw', + 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', 'md5': 'ebe28a7ad822b9ee172387d860487868', 'info_dict': { 'id': '5e779ebdd157bc0001d1c75a', @@ -217,14 +170,12 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', 'uploader_id': 'tom-scott-presents-money', - 'uploader_url': 'https://nebula.app/tom-scott-presents-money', + 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', 'duration': 825, - 'channel_url': 'https://nebula.app/tom-scott-presents-money', - 'view_count': int, + 'channel_url': 'https://nebula.tv/tom-scott-presents-money', 'series': 'Tom Scott Presents: Money', 'display_id': 'money-episode-1-the-draw', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - 'average_rating': int, 'creator': 'Tom Scott Presents: Money', }, }, @@ -251,7 +202,7 @@ class NebulaSubscriptionsIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/myshows' _TESTS = [ { - 'url': 'https://nebula.app/myshows', + 'url': 'https://nebula.tv/myshows', 'playlist_mincount': 1, 'info_dict': { 'id': 'myshows', @@ -279,7 +230,7 @@ class NebulaChannelIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P[-\w]+)' _TESTS = [ { - 'url': 'https://nebula.app/tom-scott-presents-money', + 'url': 'https://nebula.tv/tom-scott-presents-money', 'info_dict': { 'id': 'tom-scott-presents-money', 'title': 'Tom Scott Presents: Money', @@ -287,13 +238,13 @@ class NebulaChannelIE(NebulaBaseIE): }, 'playlist_count': 5, }, { - 'url': 'https://nebula.app/lindsayellis', + 'url': 'https://nebula.tv/lindsayellis', 'info_dict': { 'id': 'lindsayellis', 'title': 'Lindsay Ellis', 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', }, - 'playlist_mincount': 100, + 'playlist_mincount': 2, }, ] From 9acca71237f42a4775008e51fe26e42f0a39c552 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 18:05:46 +0900 Subject: [PATCH 082/871] [extractor/boxcast] Add extractor (#5983) Authored by: HobbyistDev Closes #5769 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/boxcast.py | 102 ++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 yt_dlp/extractor/boxcast.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 70cb82277..797e5668a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -245,6 +245,7 @@ from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE +from .boxcast import BoxCastVideoIE from .booyah import BooyahClipsIE from .bpb import BpbIE from .br import ( diff --git a/yt_dlp/extractor/boxcast.py b/yt_dlp/extractor/boxcast.py new file mode 100644 index 000000000..51f9eb787 --- /dev/null +++ b/yt_dlp/extractor/boxcast.py @@ -0,0 +1,102 @@ +from .common import InfoExtractor +from ..utils import ( + js_to_json, + traverse_obj, + unified_timestamp +) + + +class BoxCastVideoIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://boxcast\.tv/(?: + view-embed/| + channel/\w+\?(?:[^#]+&)?b=| + video-portal/(?:\w+/){2} + )(?P[\w-]+)''' + _EMBED_REGEX = [r']+src=["\'](?Phttps?://boxcast\.tv/view-embed/[\w-]+)'] + _TESTS = [{ + 'url': 'https://boxcast.tv/view-embed/in-the-midst-of-darkness-light-prevails-an-interdisciplinary-symposium-ozmq5eclj50ujl4bmpwx', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }, { + 'url': 'https://boxcast.tv/video-portal/vctwevwntun3o0ikq7af/rvyblnn0fxbfjx5nwxhl/otbpltj2kzkveo2qz3ad', + 'info_dict': { + 'id': 'otbpltj2kzkveo2qz3ad', + 'ext': 'mp4', + 'uploader_id': 'vctwevwntun3o0ikq7af', + 'uploader': 'Legacy Christian Church', + 'title': 'The Quest | 1: Beginner\'s Bay | Jamie Schools', + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg' + } + }, { + 'url': 'https://boxcast.tv/channel/z03fqwaeaby5lnaawox2?b=ssihlw5gvfij2by8tkev', + 'info_dict': { + 'id': 'ssihlw5gvfij2by8tkev', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg$', + 'release_date': '20230101', + 'uploader_id': 'ds25vaazhlu4ygcvffid', + 'release_timestamp': 1672543201, + 'uploader': 'Lighthouse Ministries International - Beltsville, Maryland', + 'description': 'md5:ac23e3d01b0b0be592e8f7fe0ec3a340', + 'title': 'New Year\'s Eve CROSSOVER Service at LHMI | December 31, 2022', + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://childrenshealthdefense.eu/live-stream/', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + webpage_json_data = self._search_json( + r'var\s*BOXCAST_PRELOAD\s*=', webpage, 'broadcast data', display_id, + transform_source=js_to_json, default={}) + + # Ref: https://support.boxcast.com/en/articles/4235158-build-a-custom-viewer-experience-with-boxcast-api + broadcast_json_data = ( + traverse_obj(webpage_json_data, ('broadcast', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}', display_id)) + view_json_data = ( + traverse_obj(webpage_json_data, ('view', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}/view', + display_id, fatal=False) or {}) + + formats, subtitles = [], {} + if view_json_data.get('status') == 'recorded': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + view_json_data['playlist'], display_id) + + return { + 'id': str(broadcast_json_data['id']), + 'title': (broadcast_json_data.get('name') + or self._html_search_meta(['og:title', 'twitter:title'], webpage)), + 'description': (broadcast_json_data.get('description') + or self._html_search_meta(['og:description', 'twitter:description'], webpage) + or None), + 'thumbnail': (broadcast_json_data.get('preview') + or self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': unified_timestamp(broadcast_json_data.get('streamed_at')), + 'uploader': broadcast_json_data.get('account_name'), + 'uploader_id': broadcast_json_data.get('account_id'), + } From 30031be974d210f451100339699ef03b0ddb5f10 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 18:16:46 +0900 Subject: [PATCH 083/871] [extractor/tempo] Add IVXPlayer extractor (#5837) Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/tempo.py | 119 ++++++++++++++++++++++++-------- 2 files changed, 91 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 797e5668a..6dab2636b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1858,7 +1858,7 @@ ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE -from .tempo import TempoIE +from .tempo import TempoIE, IVXPlayerIE from .tencent import ( IflixEpisodeIE, IflixSeriesIE, diff --git a/yt_dlp/extractor/tempo.py b/yt_dlp/extractor/tempo.py index 1cfb956e5..9318d6f9a 100644 --- a/yt_dlp/extractor/tempo.py +++ b/yt_dlp/extractor/tempo.py @@ -1,5 +1,81 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, parse_iso8601, str_or_none, traverse_obj +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + try_call +) + + +class IVXPlayerIE(InfoExtractor): + _VALID_URL = r'ivxplayer:(?P\d+):(?P\w+)' + _TESTS = [{ + 'url': 'ivxplayer:2366065:4a89dfe6bc8f002596b1dfbd600730b1', + 'info_dict': { + 'id': '2366065', + 'ext': 'mp4', + 'duration': 112, + 'upload_date': '20221204', + 'title': 'Film Indonesia di Disney Content Showcase Asia Pacific 2022', + 'timestamp': 1670151746, + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2366065?width=300' + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.cantika.com/video/31737/film-indonesia-di-disney-content-showcase-asia-pacific-2022', + 'info_dict': { + 'id': '2374200', + 'ext': 'mp4', + 'duration': 110, + 'title': 'Serial Indonesia di Disney Content Showcase Asia Pacific 2022', + 'timestamp': 1670639416, + 'upload_date': '20221210', + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2374200?width=300' + } + }, { + 'url': 'https://www.gooto.com/video/11437/wuling-suv-ramai-dikunjungi-di-giias-2018', + 'info_dict': { + 'id': '892109', + 'ext': 'mp4', + 'title': 'Wuling SUV Ramai Dikunjungi di GIIAS 2018', + 'upload_date': '20180811', + 'description': 'md5:6d901483d0aacc664aecb4489719aafa', + 'duration': 75, + 'timestamp': 1534011263, + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/892109?width=300' + } + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # more info at https://player.ivideosmart.com/ivsplayer/v4/dist/js/loader.js + mobj = re.search( + r']+data-ivs-key\s*=\s*"(?P[\w]+)\s*[^>]+\bdata-ivs-vid="(?P[\w-]+)', + webpage) + if mobj: + yield f'ivxplayer:{mobj.group("video_id")}:{mobj.group("player_key")}' + raise cls.StopExtraction() + + def _real_extract(self, url): + video_id, player_key = self._match_valid_url(url).group('video_id', 'player_key') + json_data = self._download_json( + f'https://ivxplayer.ivideosmart.com/prod/video/{video_id}?key={player_key}', video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_data['player']['video_url'], video_id) + + return { + 'id': str(json_data['ivx']['id']), + 'title': traverse_obj(json_data, ('ivx', 'name')), + 'description': traverse_obj(json_data, ('ivx', 'description')), + 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))), + 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'published_at'))), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': traverse_obj(json_data, ('ivx', 'thumbnail_url')) + } class TempoIE(InfoExtractor): @@ -7,14 +83,14 @@ class TempoIE(InfoExtractor): _TESTS = [{ 'url': 'https://video.tempo.co/read/30058/anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', 'info_dict': { - 'id': '2144438', + 'id': '2144275', + 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', 'ext': 'mp4', 'title': 'Anies Baswedan Ajukan Banding Putusan PTUN Batalkan UMP DKI', - 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', - 'duration': 84, + 'duration': 85, 'description': 'md5:a6822b7c4c874fa7e5bd63e96a387b66', 'thumbnail': 'https://statik.tempo.co/data/2022/07/27/id_1128287/1128287_720.jpg', - 'timestamp': 1658911277, + 'timestamp': 1658907970, 'upload_date': '20220727', 'tags': ['Anies Baswedan', ' PTUN', ' PTUN | Pengadilan Tata Usaha Negara', ' PTUN Batalkan UMP DKI', ' UMP DKI'], } @@ -24,30 +100,15 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_key, widget_id = self._search_regex( - r']+data-ivs-key\s*=\s*"(?P[\w]+)[^>]+\bdata-ivs-wid="(?P[\w-]+)', - webpage, 'player_key, widget_id', group=('player_key', 'widget_id')) + _, video_id, player_key = next(IVXPlayerIE._extract_embed_urls(url, webpage)).split(':') json_ld_data = self._search_json_ld(webpage, display_id) - json_data = self._download_json( - f'https://ivxplayer.ivideosmart.com/prod/widget/{widget_id}', - display_id, query={'key': player_key}) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - json_data['player']['video_url'], display_id, ext='mp4') - - return { - 'id': str(json_data['ivx']['id']), - 'display_id': display_id, - 'formats': formats, - 'subtitles': subtitles, - 'title': (self._html_search_meta('twitter:title', webpage) or self._og_search_title(webpage) - or traverse_obj(json_data, ('ivx', 'name'))), - 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))), - 'thumbnail': (self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage) - or traverse_obj(json_data, ('ivx', 'thumbnail_url'))), - 'description': (json_ld_data.get('description') or self._html_search_meta(['description', 'twitter:description'], webpage) - or self._og_search_description(webpage)), - 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'created_at'))), - 'tags': str_or_none(self._html_search_meta('keywords', webpage), '').split(','), - } + return self.url_result( + f'ivxplayer:{video_id}:{player_key}', display_id=display_id, + thumbnail=self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage), + tags=try_call(lambda: self._html_search_meta('keywords', webpage).split(',')), + description=(json_ld_data.get('description') + or self._html_search_meta(('description', 'twitter:description'), webpage) + or self._og_search_description(webpage)), + url_transparent=True) From a5387729696a5b33f53f60ef06f48e45663b12dd Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 17 Feb 2023 17:52:22 +0530 Subject: [PATCH 084/871] [cleanup] Misc Closes #5897 --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 4 +-- .../ISSUE_TEMPLATE/2_site_support_request.yml | 4 +-- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 4 +-- .github/ISSUE_TEMPLATE/4_bug_report.yml | 4 +-- .github/ISSUE_TEMPLATE/5_feature_request.yml | 4 +-- .github/ISSUE_TEMPLATE/6_question.yml | 4 +-- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml | 2 +- .../2_site_support_request.yml | 2 +- .../3_site_feature_request.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml | 2 +- .../ISSUE_TEMPLATE_tmpl/5_feature_request.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/6_question.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 2 +- CONTRIBUTORS | 2 +- Collaborators.md | 4 ++- README.md | 8 +++-- devscripts/make_issue_template.py | 2 +- yt_dlp/YoutubeDL.py | 4 ++- yt_dlp/cache.py | 1 - yt_dlp/extractor/amazonminitv.py | 3 +- yt_dlp/extractor/embedly.py | 29 +++++++++++++++++++ yt_dlp/extractor/radiko.py | 4 +-- yt_dlp/extractor/youtube.py | 13 --------- yt_dlp/options.py | 2 +- yt_dlp/postprocessor/metadataparser.py | 4 ++- yt_dlp/utils.py | 17 +++++++++-- 26 files changed, 84 insertions(+), 47 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index d116cd7c6..2237665e3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: checkboxes id: checklist @@ -24,7 +24,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 2bbf93a93..0e2940d86 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: checkboxes id: checklist @@ -24,7 +24,7 @@ body: required: true - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index d1d3514f2..92501be2e 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: checkboxes id: checklist @@ -22,7 +22,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 8c851a945..bdfc0efb8 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: checkboxes id: checklist @@ -24,7 +24,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 444df3c32..c9e3aba38 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: checkboxes id: checklist @@ -22,7 +22,7 @@ body: required: true - label: I've verified that I'm running yt-dlp version **2023.01.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 997278f21..fe6a4ee3f 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -7,7 +7,7 @@ body: label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field required: true - type: markdown attributes: @@ -28,7 +28,7 @@ body: required: true - label: I've verified that I'm running yt-dlp version **2023.01.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index e1b1e5138..85900e92e 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -18,7 +18,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 12a1c6598..75d62e7bb 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -18,7 +18,7 @@ body: required: true - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 2b46650f7..18b30f578 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -16,7 +16,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 377efbe33..90f59e70b 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -18,7 +18,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index 8bbc5d733..ef3bb2269 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -16,7 +16,7 @@ body: required: true - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index ee09e82a3..4bef82d5a 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -22,7 +22,7 @@ body: required: true - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7c271565f..c4d3e812e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -30,7 +30,7 @@ ### Before submitting a *pull request* make sure you have: - [ ] [Searched](https://github.com/yt-dlp/yt-dlp/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests - [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) and [ran relevant tests](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) -### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: +### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check all of the following options that apply: - [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) - [ ] I am not the original author of this code but it is in public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 18fd70e4d..e3b95e2f3 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -4,6 +4,7 @@ coletdjnz/colethedj (collaborator) Ashish0804 (collaborator) nao20010128nao/Lesmiscore (collaborator) bashonly (collaborator) +Grub4K (collaborator) h-h-h-h pauldubois98 nixxo @@ -319,7 +320,6 @@ columndeeply DoubleCouponDay Fabi019 GautamMKGarg -Grub4K itachi-19 jeroenj josanabr diff --git a/Collaborators.md b/Collaborators.md index 3bce437c9..fe2a7f4b4 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -8,6 +8,7 @@ # Collaborators ## [pukkandan](https://github.com/pukkandan) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan) +[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) * Owner of the fork @@ -25,8 +26,9 @@ ## [shirt](https://github.com/shirt-dev) ## [coletdjnz](https://github.com/coletdjnz) -[![gh-sponsor](https://img.shields.io/badge/_-Sponsor-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) +[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) +* Improved plugin architecture * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements * Added support for new websites YoutubeWebArchive, MainStreaming, PRX, nzherald, Mediaklikk, StarTV etc * Improved/fixed support for Patreon, panopto, gfycat, itv, pbs, SouthParkDE etc diff --git a/README.md b/README.md index 07c74d6c3..29a6c06fd 100644 --- a/README.md +++ b/README.md @@ -788,7 +788,7 @@ ## Workarounds: --prefer-insecure Use an unencrypted connection to retrieve information about the video (Currently supported only for YouTube) - --add-header FIELD:VALUE Specify a custom HTTP header and its value, + --add-headers FIELD:VALUE Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times --bidi-workaround Work around terminals that lack @@ -1511,7 +1511,7 @@ ## Sorting Formats - `source`: The preference of the source - `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > `mms`/`rtsp` > `f4f`/`f4m`) - `vcodec`: Video Codec (`av01` > `vp9.2` > `vp9` > `h265` > `h264` > `vp8` > `h263` > `theora` > other) - - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` `ac4` > > `eac3` > `ac3` > `dts` > other) + - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `ac4` > `eac3` > `ac3` > `dts` > other) - `codec`: Equivalent to `vcodec,acodec` - `vext`: Video Extension (`mp4` > `mov` > `webm` > `flv` > other). If `--prefer-free-formats` is used, `webm` is preferred. - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). If `--prefer-free-formats` is used, the order changes to `ogg` > `opus` > `webm` > `mp3` > `m4a` > `aac` @@ -1741,6 +1741,8 @@ # EXTRACTOR ARGUMENTS Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=android_embedded,web;include_live_dash" --extractor-args "funimation:version=uncut"` +Note: In CLI, `ARG` can use `-` instead of `_`; e.g. `youtube:player-client"` becomes `youtube:player_client"` + The following extractors use this feature: #### youtube @@ -1887,7 +1889,7 @@ # EMBEDDING YT-DLP ydl.download(URLS) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L180). +Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L184). **Tip**: If you are porting your code from youtube-dl to yt-dlp, one important point to look out for is that we do not guarantee the return value of `YoutubeDL.extract_info` to be json serializable, or even be a dictionary. It will be dictionary-like, but if you want to ensure it is a serializable dictionary, pass it through `YoutubeDL.sanitize_info` as shown in the [example below](#extracting-information) diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index fd964c6c6..1ee00f2b8 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -58,7 +58,7 @@ label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE description: Fill all fields even if you think it is irrelevant for the issue options: - - label: I understand that I will be **blocked** if I remove or skip any mandatory\\* field + - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\\* field required: true '''.strip() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4e5c40b58..d6c5ce769 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3665,6 +3665,7 @@ def simplified_codec(f, field): format_field(f, 'asr', '\t%s', func=format_decimal_suffix), join_nonempty( self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, + self._format_out('DRM', 'light red') if f.get('has_drm') else None, format_field(f, 'language', '[%s]'), join_nonempty(format_field(f, 'format_note'), format_field(f, 'container', ignore=(None, f.get('ext'))), @@ -3764,12 +3765,13 @@ def get_encoding(stream): source = detect_variant() if VARIANT not in (None, 'pip'): source += '*' + klass = type(self) write_debug(join_nonempty( f'{"yt-dlp" if REPOSITORY == "yt-dlp/yt-dlp" else REPOSITORY} version', __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', - '' if _IN_CLI else 'API', + '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', delim=' ')) if not _IN_CLI: diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index f8344fe77..9dd4f2f25 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -1,5 +1,4 @@ import contextlib -import errno import json import os import re diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index 730996853..b57d985d1 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -191,7 +191,7 @@ def _real_extract(self, url): class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' - IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix' _TESTS = [{ 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'playlist_mincount': 6, @@ -250,6 +250,7 @@ def _real_extract(self, url): class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix' _TESTS = [{ 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'playlist_mincount': 3, diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index 1b58fca60..458aaa0a0 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -61,6 +61,35 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'http://www.permacultureetc.com/2022/12/comment-greffer-facilement-les-arbres-fruitiers.html', + 'info_dict': { + 'id': 'pfUK_ADTvgY', + 'ext': 'mp4', + 'title': 'Comment greffer facilement les arbres fruitiers ? (mois par mois)', + 'description': 'md5:d3a876995e522f138aabb48e040bfb4c', + 'view_count': int, + 'upload_date': '20221210', + 'comment_count': int, + 'live_status': 'not_live', + 'channel_id': 'UCsM4_jihNFYe4CtSkXvDR-Q', + 'channel_follower_count': int, + 'tags': ['permaculture', 'jardinage', 'dekarz', 'autonomie', 'greffe', 'fruitiers', 'arbres', 'jardin forêt', 'forêt comestible', 'damien'], + 'playable_in_embed': True, + 'uploader': 'permaculture agroécologie etc...', + 'channel': 'permaculture agroécologie etc...', + 'thumbnail': 'https://i.ytimg.com/vi/pfUK_ADTvgY/sddefault.jpg', + 'duration': 1526, + 'channel_url': 'https://www.youtube.com/channel/UCsM4_jihNFYe4CtSkXvDR-Q', + 'age_limit': 0, + 'uploader_id': 'permacultureetc', + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/permacultureetc', + 'categories': ['Education'], + 'availability': 'public', + }, + }] + @classmethod def _extract_from_webpage(cls, url, webpage): # Bypass "ie=cls" and suitable check diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index 43eecba5f..7fdf78283 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -133,9 +133,9 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, 'X-Radiko-AreaId': area_id, 'X-Radiko-AuthToken': auth_token, }) - not_preferred = is_onair and not pcu.startswith(self._HOSTS_FOR_LIVE) or (not is_onair and (pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED) or pcu.startswith(self._HOSTS_FOR_LIVE))) for sf in subformats: - if not_preferred: + if (is_onair ^ pcu.startswith(self._HOSTS_FOR_LIVE)) or ( + not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)): sf['preference'] = -100 sf['format_note'] = 'not preferred' if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d891d92a3..be82bc689 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4459,19 +4459,6 @@ def wrapper(self, url): return info_dict return wrapper - def _extract_channel_id(self, webpage): - channel_id = self._html_search_meta( - 'channelId', webpage, 'channel id', default=None) - if channel_id: - return channel_id - channel_url = self._html_search_meta( - ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', - 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', - 'twitter:app:url:googleplay'), webpage, 'channel url') - return self._search_regex( - r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', - channel_url, 'channel id') - @staticmethod def _extract_basic_item_renderer(item): # Modified from _extract_grid_item_renderer diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 68a3aecc4..fd60ff55f 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1031,7 +1031,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): metavar='URL', dest='referer', default=None, help=optparse.SUPPRESS_HELP) workarounds.add_option( - '--add-header', + '--add-headers', metavar='FIELD:VALUE', dest='headers', default={}, type='str', action='callback', callback=_dict_from_options_callback, callback_kwargs={'multiple_keys': False}, diff --git a/yt_dlp/postprocessor/metadataparser.py b/yt_dlp/postprocessor/metadataparser.py index f574f2330..1d6054294 100644 --- a/yt_dlp/postprocessor/metadataparser.py +++ b/yt_dlp/postprocessor/metadataparser.py @@ -1,7 +1,7 @@ import re from .common import PostProcessor -from ..utils import Namespace, filter_dict +from ..utils import Namespace, filter_dict, function_with_repr class MetadataParserPP(PostProcessor): @@ -60,6 +60,7 @@ def run(self, info): f(info) return [], info + @function_with_repr def interpretter(self, inp, out): def f(info): data_to_parse = self._downloader.evaluate_outtmpl(template, info) @@ -76,6 +77,7 @@ def f(info): out_re = re.compile(self.format_to_regex(out)) return f + @function_with_repr def replacer(self, field, search, replace): def f(info): val = info.get(field) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 736468aef..9eb9495a0 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3650,7 +3650,8 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): }, } - sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) + sanitize_codec = functools.partial( + try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower()) vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) for ext in preferences or COMPATIBLE_CODECS.keys(): @@ -3915,7 +3916,7 @@ def __eq__(self, other): and self.chapters == other.chapters and self.ranges == other.ranges) def __repr__(self): - return f'{type(self).__name__}({self.chapters}, {self.ranges})' + return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})' def parse_dfxp_time_expr(time_expr): @@ -6018,6 +6019,18 @@ def __get__(self, _, cls): return self._cache[cls] +class function_with_repr: + def __init__(self, func): + functools.update_wrapper(self, func) + self.func = func + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def __repr__(self): + return f'{self.func.__module__}.{self.func.__qualname__}' + + class Namespace(types.SimpleNamespace): """Immutable namespace""" From 45b2ee6f4fae139892a1a4335c269dcbb6671497 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 17 Feb 2023 16:51:34 +0530 Subject: [PATCH 085/871] Update to ytdl-commit-2dd6c6e [YouTube] Avoid crash if uploader_id extraction fails https://github.com/ytdl-org/youtube-dl/commit/2dd6c6edd8e0fc5e45865b8e6d865e35147de772 Except: * 295736c9cba714fb5de7d1c3dd31d86e50091cf8 [jsinterp] Improve parsing * 384f632e8a9b61e864a26678d85b2b39933b9bae [ITV] Overhaul ITV extractor * 33db85c571304bbd6863e3407ad8d08764c9e53b [feat]: Add support to external downloader aria2p --- README.md | 2 +- test/test_InfoExtractor.py | 2 + test/test_age_restriction.py | 19 +- yt_dlp/compat/_legacy.py | 30 ++- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/americastestkitchen.py | 78 +++++- yt_dlp/extractor/blerp.py | 167 ++++++++++++ yt_dlp/extractor/callin.py | 55 +++- yt_dlp/extractor/cammodels.py | 39 +-- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/ign.py | 337 +++++++++++++++++------- yt_dlp/extractor/kommunetv.py | 31 +++ yt_dlp/extractor/myvideoge.py | 68 +++-- yt_dlp/extractor/pr0gramm.py | 97 +++++++ yt_dlp/extractor/rbgtum.py | 93 +++++++ yt_dlp/extractor/unsupported.py | 3 + yt_dlp/extractor/vimeo.py | 55 ++-- yt_dlp/extractor/xhamster.py | 8 +- yt_dlp/utils.py | 28 +- 19 files changed, 911 insertions(+), 210 deletions(-) create mode 100644 yt_dlp/extractor/blerp.py create mode 100644 yt_dlp/extractor/kommunetv.py create mode 100644 yt_dlp/extractor/pr0gramm.py create mode 100644 yt_dlp/extractor/rbgtum.py diff --git a/README.md b/README.md index 29a6c06fd..9b91775bc 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ # NEW FEATURES -* Merged with **youtube-dl v2021.12.17+ [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f)** and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) +* Merged with **youtube-dl v2021.12.17+ [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)** ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 683ead315..e8d94a6ac 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -69,6 +69,7 @@ def test_opengraph(self): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -81,6 +82,7 @@ def test_opengraph(self): self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) + self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value') def test_html_search_meta(self): ie = self.ie diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index ff248432b..68107590e 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -10,6 +10,7 @@ from test.helper import is_download_test, try_rm from yt_dlp import YoutubeDL +from yt_dlp.utils import DownloadError def _download_restricted(url, filename, age): @@ -25,10 +26,14 @@ def _download_restricted(url, filename, age): ydl.add_default_info_extractors() json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) - ydl.download([url]) - res = os.path.exists(json_filename) - try_rm(json_filename) - return res + try: + ydl.download([url]) + except DownloadError: + pass + else: + return os.path.exists(json_filename) + finally: + try_rm(json_filename) @is_download_test @@ -38,12 +43,12 @@ def _assert_restricted(self, url, filename, age, old_age=None): self.assertFalse(_download_restricted(url, filename, age)) def test_youtube(self): - self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10) def test_youporn(self): self._assert_restricted( - 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - '505835.mp4', 2, old_age=25) + 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/', + '16715086.mp4', 2, old_age=25) if __name__ == '__main__': diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index d19333d31..84d749209 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -1,5 +1,6 @@ """ Do not use! """ +import base64 import collections import ctypes import getpass @@ -29,6 +30,7 @@ from re import Pattern as compat_Pattern # noqa: F401 from re import match as compat_Match # noqa: F401 +from . import compat_expanduser, compat_HTMLParseError, compat_realpath from .compat_utils import passthrough_module from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 from ..dependencies import brotli as compat_brotli # noqa: F401 @@ -47,23 +49,25 @@ def compat_setenv(key, value, env=os.environ): env[key] = value +compat_base64_b64decode = base64.b64decode compat_basestring = str compat_casefold = str.casefold compat_chr = chr compat_collections_abc = collections.abc -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = http.cookiejar.Cookie -compat_cookies = http.cookies -compat_cookies_SimpleCookie = http.cookies.SimpleCookie -compat_etree_Element = etree.Element -compat_etree_register_namespace = etree.register_namespace +compat_cookiejar = compat_http_cookiejar = http.cookiejar +compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = compat_http_cookies = http.cookies +compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element +compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace compat_filter = filter compat_get_terminal_size = shutil.get_terminal_size compat_getenv = os.getenv -compat_getpass = getpass.getpass +compat_getpass = compat_getpass_getpass = getpass.getpass compat_html_entities = html.entities compat_html_entities_html5 = html.entities.html5 -compat_HTMLParser = html.parser.HTMLParser +compat_html_parser_HTMLParseError = compat_HTMLParseError +compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser compat_http_client = http.client compat_http_server = http.server compat_input = input @@ -72,6 +76,8 @@ def compat_setenv(key, value, env=os.environ): compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) +compat_os_path_expanduser = compat_expanduser +compat_os_path_realpath = compat_realpath compat_print = print compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection @@ -81,7 +87,9 @@ def compat_setenv(key, value, env=os.environ): compat_subprocess_get_DEVNULL = lambda: DEVNULL compat_tokenize_tokenize = tokenize.tokenize compat_urllib_error = urllib.error +compat_urllib_HTTPError = urllib.error.HTTPError compat_urllib_parse = urllib.parse +compat_urllib_parse_parse_qs = urllib.parse.parse_qs compat_urllib_parse_quote = urllib.parse.quote compat_urllib_parse_quote_plus = urllib.parse.quote_plus compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus @@ -90,8 +98,10 @@ def compat_setenv(key, value, env=os.environ): compat_urllib_request = urllib.request compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_response = urllib.response -compat_urlretrieve = urllib.request.urlretrieve -compat_xml_parse_error = etree.ParseError +compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None + +legacy = [] diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 6dab2636b..a7bcafb4c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -239,6 +239,7 @@ BleacherReportIE, BleacherReportCMSIE, ) +from .blerp import BlerpIE from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE @@ -861,6 +862,7 @@ from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .kompas import KompasVideoIE from .konserthusetplay import KonserthusetPlayIE from .koo import KooIE @@ -1460,6 +1462,7 @@ PuhuTVIE, PuhuTVSerieIE, ) +from .pr0gramm import Pr0grammStaticIE, Pr0grammIE from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE @@ -1521,6 +1524,10 @@ RayWenderlichCourseIE, ) from .rbmaradio import RBMARadioIE +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, +) from .rcs import ( RCSIE, RCSEmbedsIE, diff --git a/yt_dlp/extractor/americastestkitchen.py b/yt_dlp/extractor/americastestkitchen.py index abda55dcf..e889458a2 100644 --- a/yt_dlp/extractor/americastestkitchen.py +++ b/yt_dlp/extractor/americastestkitchen.py @@ -11,7 +11,7 @@ class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,7 +106,7 @@ def _real_extract(self, url): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P/cookscountry)?/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_path, season_number = self._match_valid_url(url).group('show', 'id') - season_number = int(season_number) + season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2') + show_path = ('/' + show) if show else '' + show = show or show1 + season_number = int_or_none(season_number) - slug = 'cco' if show_path == '/cookscountry' else 'atk' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ + playlist_id, headers={ 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) @@ -162,4 +212,4 @@ def entries(): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) diff --git a/yt_dlp/extractor/blerp.py b/yt_dlp/extractor/blerp.py new file mode 100644 index 000000000..4631ad2e9 --- /dev/null +++ b/yt_dlp/extractor/blerp.py @@ -0,0 +1,167 @@ +import json + +from .common import InfoExtractor +from ..utils import strip_or_none, traverse_obj + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) { + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + } + + return info_dict diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index e9668763e..c77179c7b 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - traverse_obj, - float_or_none, - int_or_none -) +from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj class CallinIE(InfoExtractor): @@ -35,6 +31,54 @@ class CallinIE(InfoExtractor): 'episode_number': 1, 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' } + }, { + 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'md5': '14ede27ee2c957b7e4db93140fc0745c', + 'info_dict': { + 'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'ext': 'ts', + 'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'description': 'Or, why the government doesn’t like SpaceX', + 'channel': 'The Pull Request', + 'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa', + 'duration': 3182.472, + 'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'uploader_url': 'http://thepullrequest.com', + 'upload_date': '20220902', + 'episode': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'series': 'The Pull Request', + 'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638', + 'view_count': int, + 'uploader': 'Antonio García Martínez', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png', + 'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', + 'timestamp': 1662100688.005, + } + }, { + 'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA', + 'md5': '16f704ddbf82a27e3930533b12062f07', + 'info_dict': { + 'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'ext': 'ts', + 'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.', + 'channel': 'The DEBRIEF With Briahna Joy Gray', + 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm', + 'duration': 10043.16, + 'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'uploader_url': 'http://patreon.com/badfaithpodcast', + 'upload_date': '20220826', + 'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'display_id': 'episode-', + 'series': 'The DEBRIEF With Briahna Joy Gray', + 'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7', + 'view_count': int, + 'uploader': 'Briahna Gray', + 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png', + 'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', + 'timestamp': 1661476708.282, + } }] def try_get_user_name(self, d): @@ -86,6 +130,7 @@ def _real_extract(self, url): return { 'id': id, + '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])], 'display_id': display_id, 'title': title, 'formats': formats, diff --git a/yt_dlp/extractor/cammodels.py b/yt_dlp/extractor/cammodels.py index 0509057fc..135b31529 100644 --- a/yt_dlp/extractor/cammodels.py +++ b/yt_dlp/extractor/cammodels.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - url_or_none, -) +from ..utils import int_or_none, url_or_none class CamModelsIE(InfoExtractor): @@ -17,32 +13,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -82,12 +57,20 @@ def _real_extract(self, url): 'quality': -10, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) return { 'id': user_id, 'title': user_id, + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b7c687bc3..ebacc87bc 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1338,7 +1338,7 @@ def _get_tfa_info(self, note='two-factor verification code'): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r']+?%s[^>]+?%s' diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py index d4797d35e..e4db7f9fa 100644 --- a/yt_dlp/extractor/ign.py +++ b/yt_dlp/extractor/ign.py @@ -1,17 +1,20 @@ import re +import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) +from ..compat import compat_parse_qs from ..utils import ( - HEADRequest, + ExtractorError, determine_ext, + error_to_compat_str, + extract_attributes, int_or_none, + merge_dicts, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -20,14 +23,90 @@ def _call_api(self, slug): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise + + def _extract_video_info(self, video, fatal=True): + video_id = video['videoId'] + + formats = [] + refs = traverse_obj(video, 'refs', expected_type=dict) or {} + + m3u8_url = url_or_none(refs.get('m3uUrl')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = url_or_none(refs.get('f4mUrl')) + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + for asset in (video.get('assets') or []): + asset_url = url_or_none(asset.get('url')) + if not asset_url: + continue + formats.append({ + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'quality': 1, + 'url': mezzanine_url, + }) + + thumbnails = traverse_obj( + video, ('thumbnails', ..., {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', ..., 'displayName'), + expected_type=lambda x: x.strip() or None) + + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + 'tags': tags, + } + class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' + _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P.+?)' + _PLAYLIST_PATH_RE = r'(?:/?\?(?P[^&#]+))?' + _VALID_URL = ( + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' + % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) IE_NAME = 'ign.com' _PAGE_TYPE = 'video' @@ -42,7 +121,13 @@ class IGNIE(IGNBaseIE): 'timestamp': 1370440800, 'upload_date': '20130605', 'tags': 'count:9', - } + 'display_id': 'the-last-of-us-review', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2014/03/26/lastofusreviewmimig2.jpg', + 'duration': 440, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', @@ -54,84 +139,48 @@ class IGNIE(IGNBaseIE): 'timestamp': 1420571160, 'upload_date': '20150106', 'tags': 'count:4', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + grids = re.findall( + r'''(?s)]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)]*>''', + webpage) + return filter(None, + (urljoin(url, m.group('path')) for m in re.finditer( + r''']+\bhref\s*=\s*('|")(?P/videos%s)\1''' + % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + def _real_extract(self, url): - display_id = self._match_id(url) - video = self._call_api(display_id) - video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] + display_id, filt = self._match_valid_url(url).group('id', 'filt') + if display_id: + return self._extract_video(url, display_id) + return self._extract_playlist(url, filt or 'all') - formats = [] - refs = video.get('refs') or {} + def _extract_playlist(self, url, display_id): + webpage = self._download_webpage(url, display_id) - m3u8_url = refs.get('m3uUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + return self.playlist_result( + (self.url_result(u, self.ie_key()) + for u in self._extract_embed_urls(url, webpage)), + playlist_id=display_id) - f4m_url = refs.get('f4mUrl') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) + def _extract_video(self, url, display_id): + video = self._checked_call_api(display_id) - for asset in (video.get('assets') or []): - asset_url = asset.get('url') - if not asset_url: - continue - formats.append({ - 'url': asset_url, - 'tbr': int_or_none(asset.get('bitrate'), 1000), - 'fps': int_or_none(asset.get('frame_rate')), - 'height': int_or_none(asset.get('height')), - 'width': int_or_none(asset.get('width')), - }) + info = self._extract_video_info(video) - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) - if mezzanine_url: - formats.append({ - 'ext': determine_ext(mezzanine_url, 'mp4'), - 'format_id': 'mezzanine', - 'quality': 1, - 'url': mezzanine_url, - }) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), + return merge_dicts({ 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } + }, info) -class IGNVideoIE(InfoExtractor): +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ -143,7 +192,16 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 'tags': 'count:13', + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 'tags': 'count:13', + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -163,22 +221,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = urllib.parse.urlparse(url) + embed_url = urllib.parse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.geturl() ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + urllib.parse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -186,34 +260,43 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', + 'tags': 'count:12', + 'thumbnail': 'https://assets1.ignimgs.com/thumbs/userUploaded/2021/8/16/gta-v-heistsjpg-e94705-1629138553533.jpeg', + 'display_id': 'grand-theft-auto-v-video-review', + 'duration': 501, }, }, { 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', + 'duration': 202, + 'tags': 'count:25', + 'display_id': 'gta-5-in-slow-motion', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2013/11/03/GTA-SLO-MO-1.jpg', }, }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -236,18 +319,84 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError): + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + if e.cause.code == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.code == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): - yield self.url_result(video_url) + if article: + # obsolete ? + def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P.+?)]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) + + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None) diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py new file mode 100644 index 000000000..e21e556be --- /dev/null +++ b/yt_dlp/extractor/kommunetv.py @@ -0,0 +1,31 @@ +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } diff --git a/yt_dlp/extractor/myvideoge.py b/yt_dlp/extractor/myvideoge.py index 513d4cb77..64cee48e7 100644 --- a/yt_dlp/extractor/myvideoge.py +++ b/yt_dlp/extractor/myvideoge.py @@ -1,5 +1,16 @@ +import re + from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + MONTH_NAMES, + clean_html, + get_element_by_class, + get_element_by_id, + int_or_none, + js_to_json, + qualities, + unified_strdate, +) class MyVideoGeIE(InfoExtractor): @@ -11,37 +22,50 @@ class MyVideoGeIE(InfoExtractor): 'id': '3941048', 'ext': 'mp4', 'title': 'The best prikol', + 'upload_date': '20200611', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8', - 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' - } + 'uploader': 'chixa33', + 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3', + }, } + _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი'] + + _quality = staticmethod(qualities(('SD', 'HD'))) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r']*>([^<]+)', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._html_search_meta(['og:image'], webpage) - uploader = self._search_regex(r']+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + title = ( + self._og_search_title(webpage, default=None) + or clean_html(get_element_by_class('my_video_title', webpage)) + or self._html_search_regex(r']*>([^<]+)]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + + upload_date = get_element_by_class('mv_vid_upl_date', webpage) + # as ka locale may not be present roll a local date conversion + upload_date = (unified_strdate( + # translate any ka month to an en one + re.sub('|'.join(self._MONTH_NAMES_KA), + lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))], + upload_date, re.I)) + if upload_date else None) return { 'id': video_id, @@ -49,5 +73,9 @@ def _formats_key(f): 'description': description, 'uploader': uploader, 'formats': formats, - 'thumbnail': thumbnail + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)), + 'like_count': int_or_none(get_element_by_id('likes_count', webpage)), + 'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)), } diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py new file mode 100644 index 000000000..2eb327fba --- /dev/null +++ b/yt_dlp/extractor/pr0gramm.py @@ -0,0 +1,97 @@ +import re + +from .common import InfoExtractor +from ..utils import merge_dicts + + +class Pr0grammStaticIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/static/5466437 + _VALID_URL = r'https?://pr0gramm\.com/static/(?P[0-9]+)' + _TEST = { + 'url': 'https://pr0gramm.com/static/5466437', + 'md5': '52fa540d70d3edc286846f8ca85938aa', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Fetch media sources + entries = self._parse_html5_media_entries(url, webpage, video_id) + media_info = entries[0] + + # Fetch author + uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') + + # Fetch approx upload timestamp from filename + # Have None-defaults in case the extraction fails + uploadDay = None + uploadMon = None + uploadYear = None + uploadTimestr = None + # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) + m = re.search(r'//img\.pr0gramm\.com/(?P[\d]+)/(?P[\d]+)/(?P[\d]+)/\w+\.\w{,4}', webpage) + + if (m): + # Up to a day of accuracy should suffice... + uploadDay = m.groupdict().get('day') + uploadMon = m.groupdict().get('mon') + uploadYear = m.groupdict().get('year') + uploadTimestr = uploadYear + uploadMon + uploadDay + + return merge_dicts({ + 'id': video_id, + 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), + 'uploader': uploader, + 'upload_date': uploadTimestr + }, media_info) + + +# This extractor is for the primary url (used for sharing, and appears in the +# location bar) Since this page loads the DOM via JS, yt-dl can't find any +# video information here. So let's redirect to a compatibility version of +# the site, which does contain the