From f352a0977879a6210b1519036fc75e9d423f277c Mon Sep 17 00:00:00 2001 From: Marcel Date: Sun, 20 Nov 2022 14:12:23 +0530 Subject: [PATCH 001/153] [webvtt] Handle premature EOF Closes #2867, closes #5600 Authored by: flashdagger --- yt_dlp/webvtt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index 1138865ba3..dd72982778 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -93,7 +93,7 @@ def __init__(self, parser): ([0-9]{3})? ''') _REGEX_EOF = re.compile(r'\Z') -_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])') +_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)') _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+') From 3b021eacefab4a9e43660d72d6d5a49f7ddb025e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 21 Nov 2022 00:51:45 +0000 Subject: [PATCH 002/153] [extractor/generic] Add `fragment_query` extractor arg for DASH and HLS (#5528) * `fragment_query`: passthrough any query in generic mpd/m3u8 manifest URLs to their fragments * Add support for `extra_param_to_segment_url` to DASH downloader Authored by: bashonly, pukkandan --- README.md | 3 +++ yt_dlp/downloader/dash.py | 14 +++++++++++--- yt_dlp/extractor/generic.py | 18 +++++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f336dcb6ac..fa55d130bb 100644 --- a/README.md +++ b/README.md @@ -1736,6 +1736,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. 
This may cause date-based filters to be slightly off +#### generic +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg + #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` * `version`: The video version to extract - `uncut` or `simulcast` diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index 8723e10689..4328d739c2 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -1,8 +1,9 @@ import time +import urllib.parse from . import get_suitable_downloader from .fragment import FragmentFD -from ..utils import urljoin +from ..utils import update_url_query, urljoin class DashSegmentsFD(FragmentFD): @@ -40,7 +41,12 @@ def real_download(self, filename, info_dict): self._prepare_and_start_frag_download(ctx, fmt) ctx['start'] = real_start - fragments_to_download = self._get_fragments(fmt, ctx) + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + + fragments_to_download = self._get_fragments(fmt, ctx, extra_query) if real_downloader: self.to_screen( @@ -57,7 +63,7 @@ def _resolve_fragments(self, fragments, ctx): fragments = fragments(ctx) if callable(fragments) else fragments return [next(iter(fragments))] if self.params.get('test') else fragments - def _get_fragments(self, fmt, ctx): + def _get_fragments(self, fmt, ctx, extra_query): fragment_base_url = fmt.get('fragment_base_url') fragments = self._resolve_fragments(fmt['fragments'], ctx) @@ -70,6 +76,8 @@ def _get_fragments(self, fmt, ctx): if not fragment_url: assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) + if extra_query: + fragment_url = update_url_query(fragment_url, extra_query) yield { 'frag_index': frag_index, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 
5da77273d8..2fcbc6f43f 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2189,6 +2189,13 @@ def report_detected(self, name, num=1, note=None): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') + def _fragment_query(self, url): + if self._configuration_arg('fragment_query'): + query_string = urllib.parse.urlparse(url).query + if query_string: + return {'extra_param_to_segment_url': query_string} + return {} + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', @@ -2351,8 +2358,10 @@ def _real_extract(self, url): subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: @@ -2379,6 +2388,7 @@ def _real_extract(self, url): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + info_dict.update(self._fragment_query(url)) return info_dict # Maybe it's a direct link to a video? 
@@ -2429,6 +2439,7 @@ def _real_extract(self, url): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + info_dict.update(self._fragment_query(url)) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2541,7 +2552,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - else: + for fmt in formats: + fmt.update(self._fragment_query(src)) + + if not formats: formats.append({ 'url': src, 'ext': (mimetype2ext(src_type) @@ -2776,8 +2790,10 @@ def filter_video(urls): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: From 7ff2fafe47aa9978f89ff358a8b9f9261430f33a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 21 Nov 2022 00:55:57 +0000 Subject: [PATCH 003/153] [extractor/vimeo] Add `VimeoProIE` (#5596) * Add support for VimeoPro URLs not containing a Vimeo video ID * Add support for password-protected VimeoPro pages Closes #5594 Authored by: bashonly, pukkandan --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/vimeo.py | 132 +++++++++++++++++++++----------- 2 files changed, 90 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py 
b/yt_dlp/extractor/_extractors.py index c1ab5a9640..a3c5472f0e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2096,6 +2096,7 @@ VimeoGroupsIE, VimeoLikesIE, VimeoOndemandIE, + VimeoProIE, VimeoReviewIE, VimeoUserIE, VimeoWatchLaterIE, diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 26fe566b03..97b99fc509 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -2,6 +2,7 @@ import functools import re import itertools +import urllib.error from .common import InfoExtractor from ..compat import ( @@ -311,7 +312,7 @@ class VimeoIE(VimeoBaseInfoExtractor): ) \. )? - vimeo(?:pro)?\.com/ + vimeo\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:[^/]+/)*? (?: @@ -355,31 +356,6 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'skip': 'No longer available' }, - { - 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', - 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', - 'note': 'Vimeo Pro video (#1197)', - 'info_dict': { - 'id': '68093876', - 'ext': 'mp4', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', - 'uploader_id': 'openstreetmapus', - 'uploader': 'OpenStreetMap US', - 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', - 'duration': 1595, - 'upload_date': '20130610', - 'timestamp': 1370893156, - 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', - 'view_count': int, - 'comment_count': int, - 'like_count': int, - }, - 'params': { - 'format': 'best[protocol=https]', - }, - }, { 'url': 'http://player.vimeo.com/video/54469442', 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd', @@ -837,15 +813,7 @@ def _real_extract(self, url): if unlisted_hash: return self._extract_from_api(video_id, unlisted_hash) - orig_url = url - is_pro = 'vimeopro.com/' in url - 
if is_pro: - # some videos require portfolio_id to be present in player url - # https://github.com/ytdl-org/youtube-dl/issues/20070 - url = self._extract_url(url, self._download_webpage(url, video_id)) - if not url: - url = 'https://vimeo.com/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): + if any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id self._try_album_password(url) @@ -947,14 +915,6 @@ def is_rented(): video_description = self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, default=None) - if not video_description and is_pro: - orig_webpage = self._download_webpage( - orig_url, video_id, - note='Downloading webpage for description', - fatal=False) - if orig_webpage: - video_description = self._html_search_meta( - 'description', orig_webpage, default=None) if not video_description: self.report_warning('Cannot find video description') @@ -1393,3 +1353,89 @@ def _real_extract(self, url): info = self._parse_config(config, video_id) info['id'] = video_id return info + + +class VimeoProIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:pro' + _VALID_URL = r'https?://(?:www\.)?vimeopro\.com/[^/?#]+/(?P[^/?#]+)(?:(?:/videos?/(?P[0-9]+)))?' 
+ _TESTS = [{ + # Vimeo URL derived from video_id + 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', + 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', + 'note': 'Vimeo Pro video (#1197)', + 'info_dict': { + 'id': '68093876', + 'ext': 'mp4', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', + 'uploader_id': 'openstreetmapus', + 'uploader': 'OpenStreetMap US', + 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + 'license': 'by', + 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'tags': 'count:1', + }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, { + # password-protected VimeoPro page with Vimeo player embed + 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', + 'info_dict': { + 'id': '764543723', + 'ext': 'mp4', + 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', + 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', + 'uploader': 'CADFEM', + 'uploader_id': 'cadfem', + 'uploader_url': 'https://vimeo.com/cadfem', + 'duration': 12505, + 'chapters': 'count:10', + }, + 'params': { + 'videopassword': 'Conference2022', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).group('slug', 'id') + if video_id: + display_id = video_id + webpage = self._download_webpage(url, display_id) + + password_form = self._search_regex( + r'(?is)]+?method=["\']post["\'][^>]*>(.+?password.+?)', + webpage, 'password form', default=None) + if 
password_form: + try: + webpage = self._download_webpage(url, display_id, data=urlencode_postdata({ + 'password': self._get_video_password(), + **self._hidden_inputs(password_form), + }), note='Logging in with video password') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 418: + raise ExtractorError('Wrong video password', expected=True) + raise + + description = None + # even if we have video_id, some videos require player URL with portfolio_id query param + # https://github.com/ytdl-org/youtube-dl/issues/20070 + vimeo_url = VimeoIE._extract_url(url, webpage) + if vimeo_url: + description = self._html_search_meta('description', webpage, default=None) + elif video_id: + vimeo_url = f'https://vimeo.com/{video_id}' + else: + raise ExtractorError( + 'No Vimeo embed or video ID could be found in VimeoPro page', expected=True) + + return self.url_result(vimeo_url, VimeoIE, video_id, url_transparent=True, + description=description) From 27c0f899c8f4a71e2ec8ac7ee4ab0217da7934bd Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 22 Nov 2022 00:40:02 +0000 Subject: [PATCH 004/153] [extractor/screencastify] Add extractor (#5604) Closes #5603 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/screencastify.py | 52 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 yt_dlp/extractor/screencastify.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a3c5472f0e..375ac0d066 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1603,6 +1603,7 @@ from .sbs import SBSIE from .screen9 import Screen9IE from .screencast import ScreencastIE +from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE from .scrippsnetworks import ( ScrippsNetworksWatchIE, diff --git a/yt_dlp/extractor/screencastify.py 
b/yt_dlp/extractor/screencastify.py new file mode 100644 index 0000000000..136b8479bc --- /dev/null +++ b/yt_dlp/extractor/screencastify.py @@ -0,0 +1,52 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import traverse_obj, update_url_query + + +class ScreencastifyIE(InfoExtractor): + _VALID_URL = r'https?://watch\.screencastify\.com/v/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://watch.screencastify.com/v/sYVkZip3quLKhHw4Ybk8', + 'info_dict': { + 'id': 'sYVkZip3quLKhHw4Ybk8', + 'ext': 'mp4', + 'title': 'Inserting and Aligning the Case Top and Bottom', + 'description': '', + 'uploader': 'Paul Gunn', + 'extra_param_to_segment_url': str, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + f'https://umbrella.svc.screencastify.com/api/umbrellaService/watch/{video_id}', video_id) + + query_string = traverse_obj(info, ('manifest', 'auth', 'query')) + query = urllib.parse.parse_qs(query_string) + formats = [] + dash_manifest_url = traverse_obj(info, ('manifest', 'url')) + if dash_manifest_url: + formats.extend( + self._extract_mpd_formats( + dash_manifest_url, video_id, mpd_id='dash', query=query, fatal=False)) + hls_manifest_url = traverse_obj(info, ('manifest', 'hlsUrl')) + if hls_manifest_url: + formats.extend( + self._extract_m3u8_formats( + hls_manifest_url, video_id, ext='mp4', m3u8_id='hls', query=query, fatal=False)) + for f in formats: + f['url'] = update_url_query(f['url'], query) + + return { + 'id': video_id, + 'title': info.get('title'), + 'description': info.get('description'), + 'uploader': info.get('userName'), + 'formats': formats, + 'extra_param_to_segment_url': query_string, + } From d761dfd059ded109b4feef7315bd84f7d47c6bd7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 22 Nov 2022 03:42:16 +0000 Subject: [PATCH 005/153] [extractor/naver] Improve `_VALID_URL` for `NaverNowIE` 
(#5620) Authored by: bashonly --- yt_dlp/extractor/naver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index b5425c7448..9de83abf76 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -254,7 +254,7 @@ def _extract_video_info(self, video_id, url): class NaverNowIE(NaverBaseIE): IE_NAME = 'navernow' - _VALID_URL = r'https?://now\.naver\.com/s/now\.(?P[0-9]+)' + _VALID_URL = r'https?://now\.naver\.com/s/now\.(?P\w+)' _API_URL = 'https://apis.naver.com/now_web/oldnow_web/v4' _TESTS = [{ 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay=', @@ -313,6 +313,9 @@ class NaverNowIE(NaverBaseIE): 'title': '아이키의 떰즈업', }, 'playlist_mincount': 101, + }, { + 'url': 'https://now.naver.com/s/now.kihyunplay?shareReplayId=30573291#replay', + 'only_matching': True, }] def _extract_replay(self, show_id, replay_id): From 9d52bf65ff38386a70493ce152f0883476b0709b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Tue, 22 Nov 2022 20:09:57 +0200 Subject: [PATCH 006/153] [extractor/kanal2] Add extractor (#5575) Authored by: glensc, pukkandan, bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kanal2.py | 66 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 yt_dlp/extractor/kanal2.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 375ac0d066..9d5af491b6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -820,6 +820,7 @@ from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE +from .kanal2 import Kanal2IE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE diff --git a/yt_dlp/extractor/kanal2.py b/yt_dlp/extractor/kanal2.py new file mode 100644 index 0000000000..3c0efe5981 --- /dev/null +++ b/yt_dlp/extractor/kanal2.py @@ -0,0 
+1,66 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + join_nonempty, + traverse_obj, + unified_timestamp, + update_url_query, +) + + +class Kanal2IE(InfoExtractor): + _VALID_URL = r'https?://kanal2\.postimees\.ee/[^?#]+\?([^#]+&)?id=(?P\d+)' + _TESTS = [{ + 'note': 'Test standard url (#5575)', + 'url': 'https://kanal2.postimees.ee/pluss/video/?id=40792', + 'md5': '7ea7b16266ec1798743777df241883dd', + 'info_dict': { + 'id': '40792', + 'ext': 'mp4', + 'title': 'Aedniku aabits / Osa 53 (05.08.2016 20:00)', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'md5:53cabf3c5d73150d594747f727431248', + 'upload_date': '20160805', + 'timestamp': 1470420000, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + playlist = self._download_json( + f'https://kanal2.postimees.ee/player/playlist/{video_id}', + video_id, query={'type': 'episodes'}, + headers={'X-Requested-With': 'XMLHttpRequest'}) + + return { + 'id': video_id, + 'title': join_nonempty(*traverse_obj(playlist, ('info', ('title', 'subtitle'))), delim=' / '), + 'description': traverse_obj(playlist, ('info', 'description')), + 'thumbnail': traverse_obj(playlist, ('data', 'image')), + 'formats': self.get_formats(playlist, video_id), + 'timestamp': unified_timestamp(self._search_regex( + r'\((\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2})\)$', + traverse_obj(playlist, ('info', 'subtitle')), 'timestamp', default='') + ' +0200'), + } + + def get_formats(self, playlist, video_id): + path = traverse_obj(playlist, ('data', 'path')) + if not path: + raise ExtractorError('Path value not found in playlist JSON response') + session = self._download_json( + 'https://sts.postimees.ee/session/register', + video_id, note='Creating session', errnote='Error creating session', + headers={ + 'X-Original-URI': path, + 'Accept': 'application/json', + }) + if session.get('reason') != 'OK' or not session.get('session'): + reason = session.get('reason', 'unknown error') + raise 
ExtractorError(f'Unable to obtain session: {reason}') + + formats = [] + for stream in traverse_obj(playlist, ('data', 'streams', ..., 'file')): + formats.extend(self._extract_m3u8_formats( + update_url_query(stream, {'s': session['session']}), video_id, 'mp4')) + + return formats From 0d95d8b00ad1bf879ed61f4e588753ef87ccd061 Mon Sep 17 00:00:00 2001 From: Mudassir Chapra <37051110+muddi900@users.noreply.github.com> Date: Thu, 24 Nov 2022 20:34:45 +0500 Subject: [PATCH 007/153] [extractor/gronkh] Fix `_VALID_URL` (#5628) Closes #5531 Authored by: muddi900 --- yt_dlp/extractor/gronkh.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py index b6cf141174..b9370e36c1 100644 --- a/yt_dlp/extractor/gronkh.py +++ b/yt_dlp/extractor/gronkh.py @@ -9,15 +9,26 @@ class GronkhIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?streams?/(?P\d+)' _TESTS = [{ + 'url': 'https://gronkh.tv/streams/657', + 'info_dict': { + 'id': '657', + 'ext': 'mp4', + 'title': 'H.O.R.D.E. 
- DAS ZWEiTE ZEiTALTER 🎲 Session 1', + 'view_count': int, + 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', + 'upload_date': '20221111' + }, + 'params': {'skip_download': True} + }, { 'url': 'https://gronkh.tv/stream/536', 'info_dict': { 'id': '536', 'ext': 'mp4', 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', - 'view_count': 19491, + 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', 'upload_date': '20211001' }, From c0caa805157fb315d4b24ea4e1f3eef0210c2096 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 25 Nov 2022 16:10:23 +0530 Subject: [PATCH 008/153] [extractor/naver] Treat fan subtitles as separate language Closes #5467 --- yt_dlp/extractor/naver.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 9de83abf76..e2e6e9728c 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -8,6 +8,7 @@ clean_html, dict_get, int_or_none, + join_nonempty, merge_dicts, parse_duration, traverse_obj, @@ -72,13 +73,11 @@ def extract_formats(streams, stream_type, query={}): def get_subs(caption_url): if re.search(self._CAPTION_EXT_RE, caption_url): - return [{ - 'url': replace_ext(caption_url, 'ttml'), - }, { - 'url': replace_ext(caption_url, 'vtt'), - }] - else: - return [{'url': caption_url}] + return [ + replace_ext(caption_url, 'ttml'), + replace_ext(caption_url, 'vtt'), + ] + return [caption_url] automatic_captions = {} subtitles = {} @@ -87,7 +86,13 @@ def get_subs(caption_url): if not caption_url: continue sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles - sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) + lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und' + if caption.get('type') == 'fan': + lang += '_fan%d' % 
next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in sub_dict) + sub_dict.setdefault(lang, []).extend({ + 'url': sub_url, + 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '), + } for sub_url in get_subs(caption_url)) user = meta.get('user', {}) From 86f557b636cf2dc66cd882a88ae4338086c48fbb Mon Sep 17 00:00:00 2001 From: marieell Date: Sat, 26 Nov 2022 03:30:25 +0100 Subject: [PATCH 009/153] [extractor/youporn] Fix metadata (#2768) Authored by: marieell --- yt_dlp/extractor/youporn.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 2f3f213324..8f1b9911b3 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -4,6 +4,7 @@ from ..utils import ( extract_attributes, int_or_none, + merge_dicts, str_to_int, unified_strdate, url_or_none, @@ -64,6 +65,24 @@ class YouPornIE(InfoExtractor): }, { 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/', + 'info_dict': { + 'id': '16290308', + 'age_limit': 18, + 'categories': [], + 'description': 'md5:00ea70f642f431c379763c17c2f396bc', + 'display_id': 'tinderspecial-trailer1', + 'duration': 298.0, + 'ext': 'mp4', + 'upload_date': '20201123', + 'uploader': 'Ersties', + 'tags': [], + 'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg', + 'timestamp': 1606089600, + 'title': 'Tinder In Real Life', + 'view_count': int, + } }] def _real_extract(self, url): @@ -159,7 +178,8 @@ def extract_tag_box(regex, title): r'(?s)Tags:.*?\s*]+class=["\']tagBoxContent["\'][^>]*>(.+?)', 'tags') - return { + data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) + return merge_dicts(data, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -174,4 +194,4 @@ def 
extract_tag_box(regex, title): 'tags': tags, 'age_limit': age_limit, 'formats': formats, - } + }) From 48652590ec401f4e747a5e51552cdcac20744aa1 Mon Sep 17 00:00:00 2001 From: alexia Date: Mon, 28 Nov 2022 03:36:18 +0100 Subject: [PATCH 010/153] [extractor/amazonminitv] Add extractors (#3628) Authored by: nyuszika7h, GautamMKGarg --- yt_dlp/extractor/_extractors.py | 5 + yt_dlp/extractor/amazonminitv.py | 322 +++++++++++++++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 yt_dlp/extractor/amazonminitv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9d5af491b6..2fe15f6d28 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -87,6 +87,11 @@ ) from .amcnetworks import AMCNetworksIE from .amazon import AmazonStoreIE +from .amazonminitv import ( + AmazonMiniTVIE, + AmazonMiniTVSeasonIE, + AmazonMiniTVSeriesIE, +) from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py new file mode 100644 index 0000000000..793fac2e4d --- /dev/null +++ b/yt_dlp/extractor/amazonminitv.py @@ -0,0 +1,322 @@ +import json + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, traverse_obj, try_get + + +class AmazonMiniTVIE(InfoExtractor): + _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P[a-f0-9-]+)' + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36', + } + _CLIENT_ID = 'ATVIN' + _DEVICE_LOCALE = 'en_GB' + _TESTS = [{ + 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'md5': '0045a5ea38dddd4de5a5fcec7274b476', + 'info_dict': { + 'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + 'ext': 'mp4', + 'title': 'May 
I Kiss You?', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:a549bfc747973e04feb707833474e59d', + 'release_timestamp': 1644710400, + 'release_date': '20220213', + 'duration': 846, + 'chapters': [{ + 'start_time': 815.0, + 'end_time': 846, + 'title': 'End Credits', + }], + 'series': 'Couple Goals', + 'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'season': 'Season 3', + 'season_number': 3, + 'season_id': 'amzn1.dv.gti.20331016-d9b9-4968-b991-c89fa4927a36', + 'episode': 'May I Kiss You?', + 'episode_number': 2, + 'episode_id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'md5': '9a977bffd5d99c4dd2a32b360aee1863', + 'info_dict': { + 'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'ext': 'mp4', + 'title': 'Jahaan', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:05eb765a77bf703f322f120ec6867339', + 'release_timestamp': 1647475200, + 'release_date': '20220317', + 'duration': 783, + 'chapters': [], + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }] + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... 
on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}''' + + def _call_api(self, asin, data=None, note=None): + query = {} + headers = self._HEADERS.copy() + if data: + name = 'graphql' + data['variables'].update({ + 'clientId': self._CLIENT_ID, + 'contentType': 'VOD', + 'deviceLocale': self._DEVICE_LOCALE, + 'sessionIdToken': self.session_id, + }) + headers.update({'Content-Type': 'application/json'}) + else: + name = 'prs' + query.update({ + 'clientId': self._CLIENT_ID, + 'deviceType': 'A1WMMUXPCUJL4N', + 'contentId': asin, + 'deviceLocale': self._DEVICE_LOCALE, + }) + + resp = self._download_json( + f'https://www.amazon.in/minitv/api/web/{name}', + asin, query=query, data=json.dumps(data).encode() if data else None, + headers=headers, note=note) + + if 'errors' in resp: + raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}') + + if data: + resp = resp['data'][data['operationName']] + return resp + + def _real_initialize(self): + # Download webpage to get the required guest session cookies + self._download_webpage( + 'https://www.amazon.in/minitv', + None, + headers=self._HEADERS, + note='Downloading webpage') + + self.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + + title_info = self._call_api( + asin, data={ + 'operationName': 'content', + 'variables': { + 'contentId': asin, + }, + 'query': self._GRAPHQL_QUERY_CONTENT, + }, + note='Downloading title info') + + prs = self._call_api(asin, note='Downloading playback info') + + formats = [] + subtitles = {} + for type_, asset in 
prs['playbackAssets'].items(): + if not isinstance(asset, dict): + continue + if type_ == 'hls': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + asset['manifestUrl'], asin, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=type_, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif type_ == 'dash': + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + asset['manifestUrl'], asin, mpd_id=type_, fatal=False) + formats.extend(mpd_fmts) + subtitles = self._merge_subtitles(subtitles, mpd_subs) + + duration = traverse_obj(title_info, ('description', 'contentLengthInSeconds')) + credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) + chapters = [{ + 'start_time': credits_time, + 'end_time': duration + credits_time, # FIXME: I suppose this is correct + 'title': 'End Credits', + }] if credits_time and duration else [] + is_episode = title_info.get('vodType') == 'EPISODE' + + return { + 'id': asin, + 'title': title_info.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'language': traverse_obj(title_info, ('audioTracks', 0)), + 'thumbnails': [{ + 'id': type_, + 'url': url, + } for type_, url in (title_info.get('images') or {}).items()], + 'description': traverse_obj(title_info, ('description', 'synopsis')), + 'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)), + 'duration': duration, + 'chapters': chapters, + 'series': title_info.get('seriesName'), + 'series_id': title_info.get('seriesId'), + 'season_number': title_info.get('seasonNumber'), + 'season_id': title_info.get('seasonId'), + 'episode': title_info.get('name') if is_episode else None, + 'episode_number': title_info.get('episodeNumber'), + 'episode_id': asin if is_episode else None, + } + + +class AmazonMiniTVSeasonIE(AmazonMiniTVIE): + IE_NAME = 'amazonminitv:season' + _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + 
IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + _TESTS = [{ + 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'playlist_mincount': 6, + 'info_dict': { + 'id': 'amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + }, + }, { + 'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'only_matching': True, + }] + _GRAPHQL_QUERY = ''' +query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) { + getEpisodes( + applicationContextInput: {sessionIdToken: $sessionIdToken, deviceLocale: $deviceLocale, clientId: $clientId} + episodeOrSeasonId: $episodeOrSeasonId + ) { + episodes { + ... on Episode { + contentId + name + images + seriesName + seasonId + seriesId + seasonNumber + episodeNumber + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + } + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, + data={ + 'operationName': 'getEpisodes', + 'variables': { + 'episodeOrSeasonId': asin, + }, + 'query': self._GRAPHQL_QUERY, + }, + note='Downloading season info') + + for episode in season_info['episodes']: + yield self.url_result(f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), playlist_id=asin) + + +class AmazonMiniTVSeriesIE(AmazonMiniTVIE): + IE_NAME = 'amazonminitv:series' + _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + }, + }, { + 'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0', + 'only_matching': True, + }] + _GRAPHQL_QUERY = ''' +query getSeasons($sessionIdToken: String!, $deviceLocale: String, 
$episodeOrSeasonOrSeriesId: ID!, $clientId: String) { + getSeasons( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + episodeOrSeasonOrSeriesId: $episodeOrSeasonOrSeriesId + ) { + seasons { + seasonId + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, + data={ + 'operationName': 'getSeasons', + 'variables': { + 'episodeOrSeasonOrSeriesId': asin, + }, + 'query': self._GRAPHQL_QUERY, + }, + note='Downloading series info') + + for season in season_info['seasons']: + yield self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), playlist_id=asin) From a9d069f5b8540f15caaf696bc39ce6a969f8b11c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 29 Nov 2022 07:50:58 +0530 Subject: [PATCH 011/153] [extractor/amazonminitv] Cleanup 48652590ec401f4e747a5e51552cdcac20744aa1 --- yt_dlp/extractor/amazonminitv.py | 162 +++++++++++++------------------ 1 file changed, 65 insertions(+), 97 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index 793fac2e4d..7309968537 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -4,16 +4,43 @@ from ..utils import ExtractorError, int_or_none, traverse_obj, try_get -class AmazonMiniTVIE(InfoExtractor): +class AmazonMiniTVBaseIE(InfoExtractor): + def _real_initialize(self): + self._download_webpage( + 'https://www.amazon.in/minitv', None, + note='Fetching guest session cookies') + AmazonMiniTVBaseIE.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value + + def _call_api(self, asin, data=None, note=None): + device = {'clientId': 'ATVIN', 'deviceLocale': 'en_GB'} + if data: + data['variables'].update({ + 'contentType': 'VOD', + 'sessionIdToken': self.session_id, + **device, + 
}) + + resp = self._download_json( + f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', + asin, note=note, headers={'Content-Type': 'application/json'}, + data=json.dumps(data).encode() if data else None, + query=None if data else { + 'deviceType': 'A1WMMUXPCUJL4N', + 'contentId': asin, + **device, + }) + + if resp.get('errors'): + raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}') + elif not data: + return resp + return resp['data'][data['operationName']] + + +class AmazonMiniTVIE(AmazonMiniTVBaseIE): _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P[a-f0-9-]+)' - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36', - } - _CLIENT_ID = 'ATVIN' - _DEVICE_LOCALE = 'en_GB' _TESTS = [{ 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', - 'md5': '0045a5ea38dddd4de5a5fcec7274b476', 'info_dict': { 'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', 'ext': 'mp4', @@ -24,11 +51,7 @@ class AmazonMiniTVIE(InfoExtractor): 'release_timestamp': 1644710400, 'release_date': '20220213', 'duration': 846, - 'chapters': [{ - 'start_time': 815.0, - 'end_time': 846, - 'title': 'End Credits', - }], + 'chapters': 'count:2', 'series': 'Couple Goals', 'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'season': 'Season 3', @@ -40,7 +63,6 @@ class AmazonMiniTVIE(InfoExtractor): }, }, { 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', - 'md5': '9a977bffd5d99c4dd2a32b360aee1863', 'info_dict': { 'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', 'ext': 'mp4', @@ -63,6 +85,7 @@ class AmazonMiniTVIE(InfoExtractor): 'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab', 'only_matching': True, }] + 
_GRAPHQL_QUERY_CONTENT = ''' query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { content( @@ -107,68 +130,13 @@ class AmazonMiniTVIE(InfoExtractor): } }''' - def _call_api(self, asin, data=None, note=None): - query = {} - headers = self._HEADERS.copy() - if data: - name = 'graphql' - data['variables'].update({ - 'clientId': self._CLIENT_ID, - 'contentType': 'VOD', - 'deviceLocale': self._DEVICE_LOCALE, - 'sessionIdToken': self.session_id, - }) - headers.update({'Content-Type': 'application/json'}) - else: - name = 'prs' - query.update({ - 'clientId': self._CLIENT_ID, - 'deviceType': 'A1WMMUXPCUJL4N', - 'contentId': asin, - 'deviceLocale': self._DEVICE_LOCALE, - }) - - resp = self._download_json( - f'https://www.amazon.in/minitv/api/web/{name}', - asin, query=query, data=json.dumps(data).encode() if data else None, - headers=headers, note=note) - - if 'errors' in resp: - raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}') - - if data: - resp = resp['data'][data['operationName']] - return resp - - def _real_initialize(self): - # Download webpage to get the required guest session cookies - self._download_webpage( - 'https://www.amazon.in/minitv', - None, - headers=self._HEADERS, - note='Downloading webpage') - - self.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value - def _real_extract(self, url): asin = f'amzn1.dv.gti.{self._match_id(url)}' - - title_info = self._call_api( - asin, data={ - 'operationName': 'content', - 'variables': { - 'contentId': asin, - }, - 'query': self._GRAPHQL_QUERY_CONTENT, - }, - note='Downloading title info') - prs = self._call_api(asin, note='Downloading playback info') - formats = [] - subtitles = {} + formats, subtitles = [], {} for type_, asset in prs['playbackAssets'].items(): - if not isinstance(asset, dict): + if not traverse_obj(asset, 'manifestUrl'): continue if type_ == 'hls': m3u8_fmts, m3u8_subs = 
self._extract_m3u8_formats_and_subtitles( @@ -181,14 +149,16 @@ def _real_extract(self, url): asset['manifestUrl'], asin, mpd_id=type_, fatal=False) formats.extend(mpd_fmts) subtitles = self._merge_subtitles(subtitles, mpd_subs) + else: + self.report_warning(f'Unknown asset type: {type_}') - duration = traverse_obj(title_info, ('description', 'contentLengthInSeconds')) + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) - chapters = [{ - 'start_time': credits_time, - 'end_time': duration + credits_time, # FIXME: I suppose this is correct - 'title': 'End Credits', - }] if credits_time and duration else [] is_episode = title_info.get('vodType') == 'EPISODE' return { @@ -203,8 +173,11 @@ def _real_extract(self, url): } for type_, url in (title_info.get('images') or {}).items()], 'description': traverse_obj(title_info, ('description', 'synopsis')), 'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)), - 'duration': duration, - 'chapters': chapters, + 'duration': traverse_obj(title_info, ('description', 'contentLengthInSeconds')), + 'chapters': [{ + 'start_time': credits_time, + 'title': 'End Credits', + }] if credits_time else [], 'series': title_info.get('seriesName'), 'series_id': title_info.get('seriesId'), 'season_number': title_info.get('seasonNumber'), @@ -215,7 +188,7 @@ def _real_extract(self, url): } -class AmazonMiniTVSeasonIE(AmazonMiniTVIE): +class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' @@ -229,6 +202,7 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVIE): 'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'only_matching': 
True, }] + _GRAPHQL_QUERY = ''' query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) { getEpisodes( @@ -258,25 +232,22 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVIE): def _entries(self, asin): season_info = self._call_api( - asin, - data={ + asin, note='Downloading season info', data={ 'operationName': 'getEpisodes', - 'variables': { - 'episodeOrSeasonId': asin, - }, + 'variables': {'episodeOrSeasonId': asin}, 'query': self._GRAPHQL_QUERY, - }, - note='Downloading season info') + }) for episode in season_info['episodes']: - yield self.url_result(f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) + yield self.url_result( + f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) def _real_extract(self, url): asin = f'amzn1.dv.gti.{self._match_id(url)}' - return self.playlist_result(self._entries(asin), playlist_id=asin) + return self.playlist_result(self._entries(asin), asin) -class AmazonMiniTVSeriesIE(AmazonMiniTVIE): +class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P[a-f0-9-]+)' _TESTS = [{ @@ -289,6 +260,7 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVIE): 'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0', 'only_matching': True, }] + _GRAPHQL_QUERY = ''' query getSeasons($sessionIdToken: String!, $deviceLocale: String, $episodeOrSeasonOrSeriesId: ID!, $clientId: String) { getSeasons( @@ -304,19 +276,15 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVIE): def _entries(self, asin): season_info = self._call_api( - asin, - data={ + asin, note='Downloading series info', data={ 'operationName': 'getSeasons', - 'variables': { - 'episodeOrSeasonOrSeriesId': asin, - }, + 'variables': {'episodeOrSeasonOrSeriesId': asin}, 'query': self._GRAPHQL_QUERY, - }, - note='Downloading series info') + }) for season in season_info['seasons']: yield 
self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId']) def _real_extract(self, url): asin = f'amzn1.dv.gti.{self._match_id(url)}' - return self.playlist_result(self._entries(asin), playlist_id=asin) + return self.playlist_result(self._entries(asin), asin) From 71eb82d1b2864927b62e0600c41b8b9db4071218 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 30 Nov 2022 05:17:45 +0530 Subject: [PATCH 012/153] [extractor/youtube] Subtitles cannot be translated to `und` Closes #5674 --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 79d082d0be..c6c89915b4 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4085,7 +4085,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if not trans_code: continue orig_trans_code = trans_code - if caption_track.get('kind') != 'asr': + if caption_track.get('kind') != 'asr' and trans_code != 'und': if not get_translated_subs: continue trans_code += f'-{lang_code}' From 9bcfe33be7f1aa7164e690ced133cae4b063efa4 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 30 Nov 2022 06:10:26 +0530 Subject: [PATCH 013/153] [utils] Make `ExtractorError` mutable --- yt_dlp/extractor/common.py | 14 ++++---------- yt_dlp/utils.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index c2b9970ec8..3ca8fe24c1 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -692,16 +692,10 @@ def extract(self, url): except UnsupportedError: raise except ExtractorError as e: - kwargs = { - 'video_id': e.video_id or self.get_temp_id(url), - 'ie': self.IE_NAME, - 'tb': e.traceback or sys.exc_info()[2], - 'expected': e.expected, - 'cause': e.cause - } - if hasattr(e, 'countries'): - kwargs['countries'] = e.countries - raise type(e)(e.orig_msg, **kwargs) 
+ e.video_id = e.video_id or self.get_temp_id(url), + e.ie = e.ie or self.IE_NAME, + e.traceback = e.traceback or sys.exc_info()[2] + raise except http.client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d351d0e36b..ed1b24335a 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1095,13 +1095,16 @@ def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=N self.exc_info = sys.exc_info() # preserve original exception if isinstance(self.exc_info[1], ExtractorError): self.exc_info = self.exc_info[1].exc_info + super().__init__(self.__msg) - super().__init__(''.join(( - format_field(ie, None, '[%s] '), - format_field(video_id, None, '%s: '), - msg, - format_field(cause, None, ' (caused by %r)'), - '' if expected else bug_reports_message()))) + @property + def __msg(self): + return ''.join(( + format_field(self.ie, None, '[%s] '), + format_field(self.video_id, None, '%s: '), + self.orig_msg, + format_field(self.cause, None, ' (caused by %r)'), + '' if self.expected else bug_reports_message())) def format_traceback(self): return join_nonempty( @@ -1109,6 +1112,12 @@ def format_traceback(self): self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]), delim='\n') or None + def __setattr__(self, name, value): + super().__setattr__(name, value) + if getattr(self, 'msg', None) and name not in ('msg', 'args'): + self.msg = self.__msg or type(self).__name__ + self.args = (self.msg, ) # Cannot be property + class UnsupportedError(ExtractorError): def __init__(self, url): From ba723997235fc50673dac8eae1503b509b7800d5 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Dec 2022 04:00:32 +0000 Subject: [PATCH 014/153] [extractor/tiktok] Fix subs, `DouyinIE`, improve `_VALID_URL` (#5676) Closes 
#5665, Closes #2267 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 152 ++++++++++++++++++++++++------------- 1 file changed, 99 insertions(+), 53 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 0ca6f5afda..1bbf88495e 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -16,6 +16,7 @@ int_or_none, join_nonempty, qualities, + remove_start, srt_subtitles_timecode, str_or_none, traverse_obj, @@ -51,7 +52,7 @@ def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -126,11 +127,21 @@ def _call_api(self, ep, query, video_id, fatal=True, continue raise e + def _extract_aweme_app(self, aweme_id): + feed_list = self._call_api( + 'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', + errnote='Unable to download video feed').get('aweme_list') or [] + aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + if not aweme_detail: + raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + return self._parse_aweme_video_app(aweme_detail) + def _get_subtitles(self, aweme_detail, aweme_id): # TODO: Extract text positioning info subtitles = {} + # aweme/detail endpoint subs captions_info = traverse_obj( - aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[]) + aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 
'auto_captions', ...), expected_type=dict) for caption in captions_info: caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False) if not caption_url: @@ -145,6 +156,24 @@ def _get_subtitles(self, aweme_detail, aweme_id): f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}' for i, line in enumerate(caption_json['utterances']) if line.get('text')) }) + # feed endpoint subs + if not subtitles: + for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict): + if not caption.get('url'): + continue + subtitles.setdefault(caption.get('lang') or 'en', []).append({ + 'ext': remove_start(caption.get('caption_format'), 'web'), + 'url': caption['url'], + }) + # webpage subs + if not subtitles: + for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', ...), expected_type=dict): + if not caption.get('Url'): + continue + subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({ + 'ext': remove_start(caption.get('Format'), 'web'), + 'url': caption['Url'], + }) return subtitles def _parse_aweme_video_app(self, aweme_detail): @@ -354,7 +383,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url): 'timestamp': int_or_none(aweme_detail.get('createTime')), 'creator': str_or_none(author_info.get('nickname')), 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')), - 'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')), + 'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')), 'uploader_url': user_url, 'track': str_or_none(music_info.get('title')), 'album': str_or_none(music_info.get('album')) or None, @@ -521,14 +550,6 @@ class TikTokIE(TikTokBaseIE): 'only_matching': True }] - def _extract_aweme_app(self, aweme_id): - feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, - 
note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) - if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) - return self._parse_aweme_video_app(aweme_detail) - def _real_extract(self, url): video_id, user_id = self._match_valid_url(url).group('id', 'user_id') try: @@ -763,56 +784,68 @@ def _real_extract(self, url): return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id) -class DouyinIE(TikTokIE): # XXX: Do not subclass from concrete IE +class DouyinIE(TikTokBaseIE): _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.douyin.com/video/6961737553342991651', - 'md5': '10523312c8b8100f353620ac9dc8f067', + 'md5': 'a97db7e3e67eb57bf40735c022ffa228', 'info_dict': { 'id': '6961737553342991651', 'ext': 'mp4', 'title': '#杨超越 小小水手带你去远航❤️', - 'uploader': '杨超越', - 'upload_date': '20210513', - 'timestamp': 1620905839, + 'description': '#杨超越 小小水手带你去远航❤️', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 'duration': 19782, + 'timestamp': 1620905839, + 'upload_date': '20210513', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6982497745948921092', - 'md5': 'd78408c984b9b5102904cf6b6bc2d712', + 'md5': '34a87ebff3833357733da3fe17e37c0e', 'info_dict': { 'id': '6982497745948921092', 'ext': 'mp4', 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', - 'uploader': '杨超越工作室', - 'upload_date': '20210708', - 'timestamp': 1625739481, + 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'uploader_id': '408654318141572', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', + 'creator': '杨超越工作室', + 
'duration': 42608, + 'timestamp': 1625739481, + 'upload_date': '20210708', + 'track': '@杨超越工作室创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6953975910773099811', - 'md5': '72e882e24f75064c218b76c8b713c185', + 'md5': 'dde3302460f19db59c47060ff013b902', 'info_dict': { 'id': '6953975910773099811', 'ext': 'mp4', 'title': '#一起看海 出现在你的夏日里', - 'uploader': '杨超越', - 'upload_date': '20210422', - 'timestamp': 1619098692, + 'description': '#一起看海 出现在你的夏日里', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 'duration': 17228, + 'timestamp': 1619098692, + 'upload_date': '20210422', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6950251282489675042', 'md5': 'b4db86aec367ef810ddd38b1737d2fed', @@ -828,25 +861,30 @@ class DouyinIE(TikTokIE): # XXX: Do not subclass from concrete IE 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, + 'skip': 'No longer available', }, { 'url': 'https://www.douyin.com/video/6963263655114722595', - 'md5': '1abe1c477d05ee62efb40bf2329957cf', + 'md5': 'cf9f11f0ec45d131445ec2f06766e122', 'info_dict': { 'id': '6963263655114722595', 'ext': 'mp4', 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', - 'uploader': '杨超越', - 'upload_date': '20210517', - 'timestamp': 1621261163, + 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 'duration': 15115, + 'timestamp': 1621261163, + 'upload_date': '20210517', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }] - _APP_VERSIONS = [('9.6.0', '960')] + _APP_VERSIONS = [('23.3.0', '230300')] _APP_NAME = 
'aweme' _AID = 1128 _API_HOSTNAME = 'aweme.snssdk.com' @@ -859,7 +897,8 @@ def _real_extract(self, url): try: return self._extract_aweme_app(video_id) except ExtractorError as e: - self.report_warning(f'{e}; trying with webpage') + e.expected = True + self.to_screen(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) render_data_json = self._search_regex( @@ -867,7 +906,10 @@ def _real_extract(self, url): webpage, 'render data', default=None) if not render_data_json: # TODO: Run verification challenge code to generate signature cookies - raise ExtractorError('Fresh cookies (not necessarily logged in) are needed') + cookies = self._get_cookies(self._WEBPAGE_HOST) + expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid') + raise ExtractorError( + 'Fresh cookies (not necessarily logged in) are needed', expected=expected) render_data = self._parse_json( render_data_json, video_id, transform_source=compat_urllib_parse_unquote) @@ -875,31 +917,35 @@ def _real_extract(self, url): class TikTokVMIE(InfoExtractor): - _VALID_URL = r'https?://(?:vm|vt)\.tiktok\.com/(?P\w+)' + _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)tiktok\.com/t)/(?P\w+)' IE_NAME = 'vm.tiktok' _TESTS = [{ - 'url': 'https://vm.tiktok.com/ZSe4FqkKd', + 'url': 'https://www.tiktok.com/t/ZTRC5xgJp', 'info_dict': { - 'id': '7023491746608712966', + 'id': '7170520270497680683', 'ext': 'mp4', - 'title': 'md5:5607564db90271abbbf8294cca77eddd', - 'description': 'md5:5607564db90271abbbf8294cca77eddd', - 'duration': 11, - 'upload_date': '20211026', - 'uploader_id': '7007385080558846981', - 'creator': 'Memes', - 'artist': 'Memes', - 'track': 'original sound', - 'uploader': 'susmandem', - 'timestamp': 1635284105, - 'thumbnail': r're:https://.+\.webp.*', - 'like_count': int, + 'title': 'md5:c64f6152330c2efe98093ccc8597871c', + 'uploader_id': '6687535061741700102', + 'upload_date': '20221127', 'view_count': int, + 'like_count': int, 'comment_count': int, + 
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX', + 'album': 'Wave of Mutilation: Best of Pixies', + 'thumbnail': r're:https://.+\.webp.*', + 'duration': 5, + 'timestamp': 1669516858, 'repost_count': int, - 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAXcNoOEOxVyBzuII_E--T0MeCrLP0ay1Sm6x_n3dluiWEoWZD0VlQOytwad4W0i0n', - } + 'artist': 'Pixies', + 'track': 'Where Is My Mind?', + 'description': 'md5:c64f6152330c2efe98093ccc8597871c', + 'uploader': 'sigmachaddeus', + 'creator': 'SigmaChad', + }, + }, { + 'url': 'https://vm.tiktok.com/ZSe4FqkKd', + 'only_matching': True, }, { 'url': 'https://vt.tiktok.com/ZSe4FqkKd', 'only_matching': True, From 0e96b408b994678764a89cabbb3879b2c383624a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Dec 2022 04:04:32 +0000 Subject: [PATCH 015/153] [extractor/reddit] Extract video embeds in text posts (#5677) Closes #5612 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 171affb932..f1a5c852af 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,15 +1,15 @@ import random -from urllib.parse import urlparse +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, + traverse_obj, try_get, unescapeHTML, url_or_none, - traverse_obj ) @@ -56,6 +56,14 @@ class RedditIE(InfoExtractor): 'comment_count': int, 'age_limit': 0, }, + }, { + # videos embedded in reddit text post + 'url': 'https://www.reddit.com/r/KamenRider/comments/wzqkxp/finale_kamen_rider_revice_episode_50_family_to/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'wzqkxp', + 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', + }, }, { 'url': 
'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, @@ -102,10 +110,6 @@ def _real_extract(self, url): data = data[0]['data']['children'][0]['data'] video_url = data['url'] - # Avoid recursing into the same reddit URL - if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: - raise ExtractorError('No media found', expected=True) - over_18 = data.get('over_18') if over_18 is True: age_limit = 18 @@ -148,6 +152,32 @@ def add_thumbnail(src): 'age_limit': age_limit, } + parsed_url = urllib.parse.urlparse(video_url) + + # Check for embeds in text posts, or else raise to avoid recursing into the same reddit URL + if 'reddit.com' in parsed_url.netloc and f'/{video_id}/' in parsed_url.path: + entries = [] + for media in traverse_obj(data, ('media_metadata', ...), expected_type=dict): + if not media.get('id') or media.get('e') != 'RedditVideo': + continue + formats = [] + if media.get('hlsUrl'): + formats.extend(self._extract_m3u8_formats( + unescapeHTML(media['hlsUrl']), video_id, 'mp4', m3u8_id='hls', fatal=False)) + if media.get('dashUrl'): + formats.extend(self._extract_mpd_formats( + unescapeHTML(media['dashUrl']), video_id, mpd_id='dash', fatal=False)) + if formats: + entries.append({ + 'id': media['id'], + 'display_id': video_id, + 'formats': formats, + **info, + }) + if entries: + return self.playlist_result(entries, video_id, info.get('title')) + raise ExtractorError('No media found', expected=True) + # Check if media is hosted on reddit: reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) if reddit_video: @@ -189,7 +219,6 @@ def add_thumbnail(src): 'duration': int_or_none(reddit_video.get('duration')), } - parsed_url = urlparse(video_url) if parsed_url.netloc == 'v.redd.it': self.raise_no_formats('This video is processing', expected=True, video_id=video_id) return { From ddf1e22d48530819d60220d0bdc36e20f5b8483b Mon Sep 17 00:00:00 2001 From: bashonly 
<88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Dec 2022 11:24:43 +0000 Subject: [PATCH 016/153] [extractor/swearnet] Fix description bug (#5681) Bug in 049565df2e24d9611a9ffdd033c80a6dafdabbe0 Closes #5643 Authored by: bashonly --- yt_dlp/extractor/swearnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py index 86a303ec73..6e216a2a56 100644 --- a/yt_dlp/extractor/swearnet.py +++ b/yt_dlp/extractor/swearnet.py @@ -62,7 +62,7 @@ def _real_extract(self, url): 'id': str(json_data['videoId']), 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage), 'description': (json_data.get('description') - or self._html_search_meta(['og:description', 'twitter:description'])), + or self._html_search_meta(['og:description', 'twitter:description'], webpage)), 'duration': int_or_none(json_data.get('seconds')), 'formats': formats, 'subtitles': subtitles, From c9f5ce511877ae4f22d2eb2f70c3c6edf6c1971d Mon Sep 17 00:00:00 2001 From: Benjamin Ryan Date: Fri, 2 Dec 2022 03:38:00 -0600 Subject: [PATCH 017/153] [extractor/tiktok] Update API hostname (#5690) Closes #5688 Authored by: redraskal --- yt_dlp/extractor/tiktok.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 1bbf88495e..95223f5de9 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -30,7 +30,7 @@ class TikTokBaseIE(InfoExtractor): _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 - _API_HOSTNAME = 'api-h2.tiktokv.com' + _API_HOSTNAME = 'api16-normal-c-useast1a.tiktokv.com' _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') From 71df9b7fd504767583cf1e088ae307c942799f2b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 30 Nov 2022 11:34:51 +0530 Subject: [PATCH 018/153] [cleanup] Misc ---
.github/workflows/core.yml | 11 ++++++----- .github/workflows/quick-test.yml | 13 ++++++------- .gitignore | 1 + CONTRIBUTING.md | 22 +++++++++++++++++++--- README.md | 26 +++++++++++++------------- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/__init__.py | 7 +++---- yt_dlp/downloader/common.py | 5 ++++- yt_dlp/extractor/common.py | 12 +++++++++++- yt_dlp/options.py | 8 ++++---- yt_dlp/utils.py | 5 ++++- 11 files changed, 72 insertions(+), 40 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index e129186265..dead444c0b 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -12,13 +12,13 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.9 is in quick-test - python-version: ['3.7', '3.10', 3.11-dev, pypy-3.7, pypy-3.8] + # CPython 3.11 is in quick-test + python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.8' + python-version: '3.7' run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 @@ -33,5 +33,6 @@ jobs: run: pip install pytest - name: Run tests continue-on-error: False - run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} core - # Linter is in quick-test + run: | + python3 -m yt_dlp -v || true # Print debug head + ./devscripts/run_tests.${{ matrix.run-tests-ext }} core diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 8a0ac98bb8..930e58152d 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -10,24 +10,23 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Set up Python + - name: Set up Python 3.11 uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: '3.11' - name: Install test requirements run: pip install pytest pycryptodomex - name: Run tests - run: ./devscripts/run_tests.sh core + run: | + python3 -m yt_dlp -v || true + 
./devscripts/run_tests.sh core flake8: name: Linter if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.9 + - uses: actions/setup-python@v4 - name: Install flake8 run: pip install flake8 - name: Make lazy extractors diff --git a/.gitignore b/.gitignore index 0ce059b34d..00d74057fa 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,7 @@ dist/ zip/ tmp/ venv/ +.venv/ completions/ # Misc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8ac671dcf..551db674e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -351,8 +351,9 @@ #### Example ```python thumbnail_data = data.get('thumbnails') or [] thumbnails = [{ - 'url': item['url'] -} for item in thumbnail_data] # correct + 'url': item['url'], + 'height': item.get('h'), +} for item in thumbnail_data if item.get('url')] # correct ``` and not like: @@ -360,12 +361,27 @@ #### Example ```python thumbnail_data = data.get('thumbnails') thumbnails = [{ - 'url': item['url'] + 'url': item['url'], + 'height': item.get('h'), } for item in thumbnail_data] # incorrect ``` In this case, `thumbnail_data` will be `None` if the field was not found and this will cause the loop `for item in thumbnail_data` to raise a fatal error. Using `or []` avoids this error and results in setting an empty list in `thumbnails` instead. 
+Alternately, this can be further simplified by using `traverse_obj` + +```python +thumbnails = [{ + 'url': item['url'], + 'height': item.get('h'), +} for item in traverse_obj(data, ('thumbnails', lambda _, v: v['url']))] +``` + +or, even better, + +```python +thumbnails = traverse_obj(data, ('thumbnails', ..., {'url': 'url', 'height': 'h'})) +``` ### Provide fallbacks diff --git a/README.md b/README.md index fa55d130bb..b6a07da9a8 100644 --- a/README.md +++ b/README.md @@ -432,19 +432,19 @@ ## Geo-restriction: explicitly provided IP block in CIDR notation ## Video Selection: - -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the videos + -I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. Use negative indices to count from the right and negative STEP to download in reverse order. E.g. "-I 1:3,7,-5::2" used on a - playlist of size 15 will download the videos + playlist of size 15 will download the items at index 1,2,3,7,11,13,15 - --min-filesize SIZE Do not download any videos smaller than + --min-filesize SIZE Abort download if filesize is smaller than + SIZE, e.g. 50k or 44.6M + --max-filesize SIZE Abort download if filesize is larger than SIZE, e.g. 50k or 44.6M - --max-filesize SIZE Do not download any videos larger than SIZE, - e.g. 50k or 44.6M --date DATE Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format [now|today|yesterday][-N[day|week|month|year]]. 
@@ -491,9 +491,9 @@ ## Video Selection: a file that is in the archive --break-on-reject Stop the download process when encountering a file that has been filtered out - --break-per-input --break-on-existing, --break-on-reject, - --max-downloads, and autonumber resets per - input URL + --break-per-input Alters --max-downloads, --break-on-existing, + --break-on-reject, and autonumber to reset + per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue --skip-playlist-after-errors N Number of allowed failures until the rest of @@ -1046,10 +1046,10 @@ ## SponsorBlock Options: for, separated by commas. Available categories are sponsor, intro, outro, selfpromo, preview, filler, interaction, - music_offtopic, poi_highlight, chapter, all and - default (=all). You can prefix the category - with a "-" to exclude it. See [1] for - description of the categories. E.g. + music_offtopic, poi_highlight, chapter, all + and default (=all). You can prefix the + category with a "-" to exclude it. See [1] + for description of the categories. E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories --sponsorblock-remove CATS SponsorBlock categories to be removed from @@ -1058,7 +1058,7 @@ ## SponsorBlock Options: remove takes precedence. 
The syntax and available categories are the same as for --sponsorblock-mark except that "default" - refers to "all,-filler" and poi_highlight and + refers to "all,-filler" and poi_highlight, chapter are not available --sponsorblock-chapter-title TEMPLATE An output template for the title of the diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index b1d009280e..8d28783d86 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3123,7 +3123,7 @@ def existing_video_file(*filepaths): fd, success = None, True if info_dict.get('protocol') or info_dict.get('url'): fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') - if fd is not FFmpegFD and ( + if fd is not FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( info_dict.get('section_start') or info_dict.get('section_end')): msg = ('This format cannot be partially downloaded' if FFmpegFD.available() else 'You have requested downloading the video partially, but ffmpeg is not installed') diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f1a3475140..f1d6c369bd 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -91,12 +91,11 @@ def get_urls(urls, batchfile, verbose): def print_extractor_information(opts, urls): - # Importing GenericIE is currently slow since it imports other extractors - # TODO: Move this back to module level after generalization of embed detection - from .extractor.generic import GenericIE - out = '' if opts.list_extractors: + # Importing GenericIE is currently slow since it imports YoutubeIE + from .extractor.generic import GenericIE + urls = dict.fromkeys(urls, False) for ie in list_extractor_classes(opts.age_limit): out += ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n' diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index fe36332506..077b29b41f 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -20,6 +20,7 @@ RetryManager, classproperty, 
decodeArgument, + deprecation_warning, encodeFilename, format_bytes, join_nonempty, @@ -180,7 +181,9 @@ def best_block_size(elapsed_time, bytes): @staticmethod def parse_bytes(bytestr): """Parse a string indicating a byte quantity into an integer.""" - parse_bytes(bytestr) + deprecation_warning('yt_dlp.FileDownloader.parse_bytes is deprecated and ' + 'may be removed in the future. Use yt_dlp.utils.parse_bytes instead') + return parse_bytes(bytestr) def slow_down(self, start_time, now, byte_counter): """Sleep if the download speed is over the rate limit.""" diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 3ca8fe24c1..3910c55adb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -71,6 +71,7 @@ str_to_int, strip_or_none, traverse_obj, + truncate_string, try_call, try_get, unescapeHTML, @@ -674,7 +675,8 @@ def extract(self, url): for _ in range(2): try: self.initialize() - self.write_debug('Extracting URL: %s' % url) + self.to_screen('Extracting URL: %s' % ( + url if self.get_param('verbose') else truncate_string(url, 100, 20))) ie_result = self._real_extract(url) if ie_result is None: return None @@ -1906,6 +1908,14 @@ def _extract_m3u8_formats_and_subtitles( errnote=None, fatal=True, live=False, data=None, headers={}, query={}): + if not m3u8_url: + if errnote is not False: + errnote = errnote or 'Failed to obtain m3u8 URL' + if fatal: + raise ExtractorError(errnote, video_id=video_id) + self.report_warning(f'{errnote}{bug_reports_message()}') + return [], {} + res = self._download_webpage_handle( m3u8_url, video_id, note='Downloading m3u8 information' if note is None else note, diff --git a/yt_dlp/options.py b/yt_dlp/options.py index bee867aa94..bc574b8857 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -535,10 +535,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '-I', '--playlist-items', dest='playlist_items', metavar='ITEM_SPEC', default=None, help=( - 'Comma separated 
playlist_index of the videos to download. ' + 'Comma separated playlist_index of the items to download. ' 'You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. ' 'Use negative indices to count from the right and negative STEP to download in reverse order. ' - 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15')) + 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the items at index 1,2,3,7,11,13,15')) selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', @@ -554,7 +554,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): selection.add_option( '--max-filesize', metavar='SIZE', dest='max_filesize', default=None, - help='Abort download if filesize if larger than SIZE, e.g. 50k or 44.6M') + help='Abort download if filesize is larger than SIZE, e.g. 50k or 44.6M') selection.add_option( '--date', metavar='DATE', dest='date', default=None, @@ -635,7 +635,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='--break-on-existing, --break-on-reject, --max-downloads, and autonumber resets per input URL') + help='Alters --max-downloads, --break-on-existing, --break-on-reject, and autonumber to reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index ed1b24335a..a3da3c69ec 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3872,6 +3872,9 @@ def __eq__(self, other): return (isinstance(other, download_range_func) and self.chapters == other.chapters and self.ranges == other.ranges) + def __repr__(self): + return f'{type(self).__name__}({self.chapters}, {self.ranges})' + def parse_dfxp_time_expr(time_expr): if not time_expr: @@ -5976,7 +5979,7 @@ def truncate_string(s, left, 
right=0): assert left > 3 and right >= 0 if s is None or len(s) <= left + right: return s - return f'{s[:left-3]}...{s[-right:]}' + return f'{s[:left-3]}...{s[-right:] if right else ""}' def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None): From c53a18f016fe6ff774411d938c9959097f00b44c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 5 Dec 2022 01:06:37 +0530 Subject: [PATCH 019/153] [utils] windows_enable_vt_mode: Proper implementation Authored by: Grub4K --- yt_dlp/utils.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index a3da3c69ec..36170e125e 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5579,17 +5579,39 @@ def supports_terminal_sequences(stream): return False -def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 +def windows_enable_vt_mode(): + """Ref: https://bugs.python.org/issue30075 """ if get_windows_version() < (10, 0, 10586): return - global WINDOWS_VT_MODE - try: - Popen.run('', shell=True) - except Exception: - return - WINDOWS_VT_MODE = True - supports_terminal_sequences.cache_clear() + import ctypes + import ctypes.wintypes + import msvcrt + + ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 + + dll = ctypes.WinDLL('kernel32', use_last_error=False) + handle = os.open('CONOUT$', os.O_RDWR) + + try: + h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle)) + dw_original_mode = ctypes.wintypes.DWORD() + success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode)) + if not success: + raise Exception('GetConsoleMode failed') + + success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD( + dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) + if not success: + raise Exception('SetConsoleMode failed') + except Exception as e: + write_string(f'WARNING: Cannot enable VT mode - {e}') + else: + global WINDOWS_VT_MODE + WINDOWS_VT_MODE = True + 
supports_terminal_sequences.cache_clear() + finally: + os.close(handle) _terminal_sequences_re = re.compile('\033\\[[^m]+m') From c4cbd3bebd33d2d77fa340a4035447ab1b9eb3eb Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Dec 2022 22:30:31 +0000 Subject: [PATCH 020/153] [extractor/tiktok] Update `_VALID_URL`, add `api_hostname` arg (#5708) Closes #5706 Authored by: bashonly --- README.md | 1 + yt_dlp/extractor/tiktok.py | 31 +++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b6a07da9a8..8fdedacf59 100644 --- a/README.md +++ b/README.md @@ -1765,6 +1765,7 @@ #### hotstar * `dr`: dynamic range to ignore - one or more of `sdr`, `hdr10`, `dv` #### tiktok +* `api_hostname`: Hostname to use for mobile API requests, e.g. `api-h2.tiktokv.com` * `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`, e.g. `20.2.1` * `manifest_app_version`: Numeric app version to call mobile APIs with, e.g. 
`221` diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 95223f5de9..2dd4510cc3 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -30,11 +30,15 @@ class TikTokBaseIE(InfoExtractor): _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 - _API_HOSTNAME = 'api16-normal-c-useast1a.tiktokv.com' _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + @property + def _API_HOSTNAME(self): + return self._configuration_arg( + 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0] + @staticmethod def _create_url(user_id, video_id): return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' @@ -398,7 +402,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P[\w\.-]+)/video)/(?P\d+)' + _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P[\w\.-]+)?/video)/(?P\d+)' _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P{_VALID_URL})'] _TESTS = [{ @@ -944,8 +948,27 @@ class TikTokVMIE(InfoExtractor): 'creator': 'SigmaChad', }, }, { - 'url': 'https://vm.tiktok.com/ZSe4FqkKd', - 'only_matching': True, + 'url': 'https://vm.tiktok.com/ZTR45GpSF/', + 'info_dict': { + 'id': '7106798200794926362', + 'ext': 'mp4', + 'title': 'md5:edc3e7ea587847f8537468f2fe51d074', + 'uploader_id': '6997695878846268418', + 'upload_date': '20220608', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:https://.+\.webp.*', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO', + 'duration': 29, + 'timestamp': 1654680400, + 'repost_count': int, + 'artist': 'Akihitoko', + 'track': 'original sound', + 'description': 'md5:edc3e7ea587847f8537468f2fe51d074', + 'uploader': 'akihitoko1', + 'creator': 'Akihitoko', + }, }, { 'url': 
'https://vt.tiktok.com/ZSe4FqkKd', 'only_matching': True, From 935bac1e4de35107a15ea2ad45402f507527dcfb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 6 Dec 2022 00:35:08 +0530 Subject: [PATCH 021/153] Fix `--cookies-from-browser` CLI parsing Closes #5716 --- yt_dlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f1d6c369bd..202f102ba9 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -350,7 +350,7 @@ def parse_chapters(name, value): mobj = re.fullmatch(r'''(?x) (?P[^+:]+) (?:\s*\+\s*(?P[^:]+))? - (?:\s*:\s*(?P.+?))? + (?:\s*:\s*(?!:)(?P.+?))? (?:\s*::\s*(?P.+))? ''', opts.cookiesfrombrowser) if mobj is None: From 7991ae57a800316930e20a15df8314616c5cba8f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 8 Dec 2022 17:17:16 +0530 Subject: [PATCH 022/153] [extractor/sibnet] Separate from VKIE Fixes https://github.com/yt-dlp/yt-dlp/commit/bfd973ece3369c593b5e82a88cc16de80088a73e#commitcomment-91834251 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/generic.py | 5 ----- yt_dlp/extractor/sibnet.py | 17 +++++++++++++++++ yt_dlp/extractor/vk.py | 6 +++--- 4 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 yt_dlp/extractor/sibnet.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2fe15f6d28..1372840893 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1639,6 +1639,7 @@ VivoIE, ) from .sharevideos import ShareVideosEmbedIE +from .sibnet import SibnetEmbedIE from .shemaroome import ShemarooMeIE from .showroomlive import ShowRoomLiveIE from .simplecast import ( diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 2fcbc6f43f..190aff3312 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1864,11 +1864,6 @@ class GenericIE(InfoExtractor): 'title': 'I AM BIO Podcast | BIO', }, 'playlist_mincount': 52, - }, - { - # Sibnet embed 
(https://help.sibnet.ru/?sibnet_video_embed) - 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', - 'only_matching': True, }, { # WimTv embed player 'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/', diff --git a/yt_dlp/extractor/sibnet.py b/yt_dlp/extractor/sibnet.py new file mode 100644 index 0000000000..73bb75d8f2 --- /dev/null +++ b/yt_dlp/extractor/sibnet.py @@ -0,0 +1,17 @@ +from .common import InfoExtractor + + +class SibnetEmbedIE(InfoExtractor): + # Ref: https://help.sibnet.ru/?sibnet_video_embed + _VALID_URL = False + _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] + _WEBPAGE_TESTS = [{ + 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', + 'info_dict': { + 'id': 'shell', # FIXME? + 'ext': 'mp4', + 'age_limit': 0, + 'thumbnail': 'https://video.sibnet.ru/upload/cover/video_1887072_0.jpg', + 'title': 'КВН Москва не сразу строилась - Девушка впервые играет в Mortal Kombat', + } + }] diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 347aa381d0..0fb95c863e 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -6,6 +6,7 @@ from .dailymotion import DailymotionIE from .odnoklassniki import OdnoklassnikiIE from .pladform import PladformIE +from .sibnet import SibnetEmbedIE from .vimeo import VimeoIE from .youtube import YoutubeIE from ..compat import compat_urlparse @@ -101,8 +102,7 @@ class VKIE(VKBaseIE): (?P-?\d+_\d+)(?:.*\blist=(?P([\da-f]+)|(ln-[\da-zA-Z]+)))? 
) ''' - # https://help.sibnet.ru/?sibnet_video_embed - _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] + _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', @@ -455,7 +455,7 @@ def _real_extract(self, url): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - sibnet_url = next(self._extract_embed_urls(url, info_page), None) + sibnet_url = next(SibnetEmbedIE._extract_embed_urls(url, info_page), None) if sibnet_url: return self.url_result(sibnet_url) From 42ec478fc4abe4131a0908881673a19aa750bc97 Mon Sep 17 00:00:00 2001 From: David Turner <547637+digitall@users.noreply.github.com> Date: Thu, 8 Dec 2022 12:38:52 +0000 Subject: [PATCH 023/153] [extractor/plutotv] Fix videos with non-zero start (#5745) Authored by: digitall --- yt_dlp/extractor/plutotv.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py index 71a05cc7a8..caffeb21df 100644 --- a/yt_dlp/extractor/plutotv.py +++ b/yt_dlp/extractor/plutotv.py @@ -84,6 +84,17 @@ class PlutoTVIE(InfoExtractor): }, { 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1', 'only_matching': True, + }, + { + 'url': 'https://pluto.tv/en/on-demand/movies/attack-of-the-killer-tomatoes-1977-1-1-ptv1', + 'md5': '7db56369c0da626a32d505ec6eb3f89f', + 'info_dict': { + 'id': '5b190c7bb0875c36c90c29c4', + 'ext': 'mp4', + 'title': 'Attack of the Killer Tomatoes', + 'description': 'A group of scientists band together to save the world from mutated tomatoes that KILL! 
(1978)', + 'duration': 5700, + } } ] @@ -103,7 +114,7 @@ def _to_ad_free_formats(self, video_id, formats, subtitles): compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8')) continue first_segment_url = re.search( - r'^(https?://.*/).+\-0+\.ts$', res, + r'^(https?://.*/).+\-0+[0-1]0\.ts$', res, re.MULTILINE) if first_segment_url: m3u8_urls.add( From dfc186d4220081fdf7184347187639b15ab68a2f Mon Sep 17 00:00:00 2001 From: lkw123 <2020393267@qq.com> Date: Thu, 8 Dec 2022 20:43:29 +0800 Subject: [PATCH 024/153] [extractor/xiami] Remove extractors (#5711) Authored by: synthpop123 --- supportedsites.md | 4 - yt_dlp/extractor/_extractors.py | 6 - yt_dlp/extractor/xiami.py | 198 -------------------------------- 3 files changed, 208 deletions(-) delete mode 100644 yt_dlp/extractor/xiami.py diff --git a/supportedsites.md b/supportedsites.md index d7565c139f..fbada177e4 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1624,10 +1624,6 @@ # Supported sites - **XHamster** - **XHamsterEmbed** - **XHamsterUser** - - **xiami:album**: 虾米音乐 - 专辑 - - **xiami:artist**: 虾米音乐 - 歌手 - - **xiami:collection**: 虾米音乐 - 精选集 - - **xiami:song**: 虾米音乐 - **ximalaya**: 喜马拉雅FM - **ximalaya:album**: 喜马拉雅FM 专辑 - **xinpianchang**: xinpianchang.com diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1372840893..54ac1b7309 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2236,12 +2236,6 @@ XHamsterEmbedIE, XHamsterUserIE, ) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) from .ximalaya import ( XimalayaIE, XimalayaAlbumIE diff --git a/yt_dlp/extractor/xiami.py b/yt_dlp/extractor/xiami.py deleted file mode 100644 index 71b2956a8e..0000000000 --- a/yt_dlp/extractor/xiami.py +++ /dev/null @@ -1,198 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import int_or_none - - -class XiamiBaseIE(InfoExtractor): - 
_API_BASE_URL = 'https://emumo.xiami.com/song/playlist/cat/json/id' - - def _download_webpage_handle(self, *args, **kwargs): - webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs) - if '>Xiami is currently not available in your country.<' in webpage: - self.raise_geo_restricted('Xiami is currently not available in your country') - return webpage - - def _extract_track(self, track, track_id=None): - track_name = track.get('songName') or track.get('name') or track['subName'] - artist = track.get('artist') or track.get('artist_name') or track.get('singers') - title = '%s - %s' % (artist, track_name) if artist else track_name - track_url = self._decrypt(track['location']) - - subtitles = {} - lyrics_url = track.get('lyric_url') or track.get('lyric') - if lyrics_url and lyrics_url.startswith('http'): - subtitles['origin'] = [{'url': lyrics_url}] - - return { - 'id': track.get('song_id') or track_id, - 'url': track_url, - 'title': title, - 'thumbnail': track.get('pic') or track.get('album_pic'), - 'duration': int_or_none(track.get('length')), - 'creator': track.get('artist', '').split(';')[0], - 'track': track_name, - 'track_number': int_or_none(track.get('track')), - 'album': track.get('album_name') or track.get('title'), - 'artist': artist, - 'subtitles': subtitles, - } - - def _extract_tracks(self, item_id, referer, typ=None): - playlist = self._download_json( - '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), - item_id, headers={ - 'Referer': referer, - }) - return [ - self._extract_track(track, item_id) - for track in playlist['data']['trackList']] - - @staticmethod - def _decrypt(origin): - n = int(origin[0]) - origin = origin[1:] - short_length = len(origin) // n - long_num = len(origin) - short_length * n - l = tuple() - for i in range(0, n): - length = short_length - if i < long_num: - length += 1 - l += (origin[0:length], ) - origin = origin[length:] - ans = '' - for i in range(0, short_length + 1): - for j in 
range(0, n): - if len(l[j]) > i: - ans += l[j][i] - return compat_urllib_parse_unquote(ans).replace('^', '0') - - -class XiamiSongIE(XiamiBaseIE): - IE_NAME = 'xiami:song' - IE_DESC = '虾米音乐' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.xiami.com/song/1775610518', - 'md5': '521dd6bea40fd5c9c69f913c232cb57e', - 'info_dict': { - 'id': '1775610518', - 'ext': 'mp3', - 'title': 'HONNE - Woman', - 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', - 'duration': 265, - 'creator': 'HONNE', - 'track': 'Woman', - 'album': 'Woman', - 'artist': 'HONNE', - 'subtitles': { - 'origin': [{ - 'ext': 'lrc', - }], - }, - }, - 'skip': 'Georestricted', - }, { - 'url': 'http://www.xiami.com/song/1775256504', - 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', - 'info_dict': { - 'id': '1775256504', - 'ext': 'mp3', - 'title': '戴荃 - 悟空', - 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', - 'duration': 200, - 'creator': '戴荃', - 'track': '悟空', - 'album': '悟空', - 'artist': '戴荃', - 'subtitles': { - 'origin': [{ - 'ext': 'lrc', - }], - }, - }, - 'skip': 'Georestricted', - }, { - 'url': 'http://www.xiami.com/song/1775953850', - 'info_dict': { - 'id': '1775953850', - 'ext': 'mp3', - 'title': 'До Скону - Чума Пожирает Землю', - 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', - 'duration': 683, - 'creator': 'До Скону', - 'track': 'Чума Пожирает Землю', - 'track_number': 7, - 'album': 'Ад', - 'artist': 'До Скону', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.xiami.com/song/xLHGwgd07a1', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self._extract_tracks(self._match_id(url), url)[0] - - -class XiamiPlaylistBaseIE(XiamiBaseIE): - def _real_extract(self, url): - item_id = self._match_id(url) - return self.playlist_result(self._extract_tracks(item_id, url, self._TYPE), item_id) - - -class XiamiAlbumIE(XiamiPlaylistBaseIE): - IE_NAME = 'xiami:album' - 
IE_DESC = '虾米音乐 - 专辑' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P[^/?#&]+)' - _TYPE = '1' - _TESTS = [{ - 'url': 'http://www.xiami.com/album/2100300444', - 'info_dict': { - 'id': '2100300444', - }, - 'playlist_count': 10, - 'skip': 'Georestricted', - }, { - 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', - 'only_matching': True, - }, { - 'url': 'http://www.xiami.com/album/URVDji2a506', - 'only_matching': True, - }] - - -class XiamiArtistIE(XiamiPlaylistBaseIE): - IE_NAME = 'xiami:artist' - IE_DESC = '虾米音乐 - 歌手' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P[^/?#&]+)' - _TYPE = '2' - _TESTS = [{ - 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', - 'info_dict': { - 'id': '2132', - }, - 'playlist_count': 20, - 'skip': 'Georestricted', - }, { - 'url': 'http://www.xiami.com/artist/bC5Tk2K6eb99', - 'only_matching': True, - }] - - -class XiamiCollectionIE(XiamiPlaylistBaseIE): - IE_NAME = 'xiami:collection' - IE_DESC = '虾米音乐 - 精选集' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P[^/?#&]+)' - _TYPE = '3' - _TEST = { - 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', - 'info_dict': { - 'id': '156527391', - }, - 'playlist_mincount': 29, - 'skip': 'Georestricted', - } From 28b8f57b4b2a2e1bd1fbe68ae1ab2c44fdd51992 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Thu, 8 Dec 2022 22:58:36 +0900 Subject: [PATCH 025/153] [extractor/noice] Add NoicePodcast extractor (#5621) Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/noice.py | 116 ++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 yt_dlp/extractor/noice.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 54ac1b7309..c9dd7463c7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1211,6 +1211,7 @@ from .nitter import NitterIE 
from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE +from .noice import NoicePodcastIE from .nonktube import NonkTubeIE from .noodlemagazine import NoodleMagazineIE from .noovo import NoovoIE diff --git a/yt_dlp/extractor/noice.py b/yt_dlp/extractor/noice.py new file mode 100644 index 0000000000..e6e343303a --- /dev/null +++ b/yt_dlp/extractor/noice.py @@ -0,0 +1,116 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class NoicePodcastIE(InfoExtractor): + _VALID_URL = r'https?://open\.noice\.id/content/(?P[a-fA-F0-9-]+)' + _TESTS = [{ + 'url': 'https://open.noice.id/content/7694bb04-ff0f-40fa-a60b-5b39f29584b2', + 'info_dict': { + 'id': '7694bb04-ff0f-40fa-a60b-5b39f29584b2', + 'ext': 'm4a', + 'season': 'Season 1', + 'description': 'md5:58d1274e6857b6fbbecf47075885380d', + 'release_date': '20221115', + 'timestamp': 1668496642, + 'season_number': 1, + 'upload_date': '20221115', + 'release_timestamp': 1668496642, + 'title': 'Eps 1. Belajar dari Wishnutama: Kreatif Bukan Followers! 
(bersama Wishnutama)', + 'modified_date': '20221121', + 'categories': ['Bisnis dan Keuangan'], + 'duration': 3567, + 'modified_timestamp': 1669030647, + 'thumbnail': 'https://images.noiceid.cc/catalog/content-1668496302560', + 'channel_id': '9dab1024-5b92-4265-ae1c-63da87359832', + 'like_count': int, + 'channel': 'Noice Space Talks', + 'comment_count': int, + 'dislike_count': int, + 'channel_follower_count': int, + } + }, { + 'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063', + 'info_dict': { + 'id': '222134e4-99f2-456f-b8a2-b8be404bf063', + 'ext': 'm4a', + 'release_timestamp': 1653488220, + 'description': 'md5:35074f6190cef52b05dd133bb2ef460e', + 'upload_date': '20220525', + 'timestamp': 1653460637, + 'release_date': '20220525', + 'thumbnail': 'https://images.noiceid.cc/catalog/content-1653460337625', + 'title': 'Eps 1: Dijodohin Sama Anak Pak RT', + 'modified_timestamp': 1669030647, + 'season_number': 1, + 'modified_date': '20221121', + 'categories': ['Cerita dan Drama'], + 'duration': 1830, + 'season': 'Season 1', + 'channel_id': '60193f6b-d24d-4b23-913b-ceed5a731e74', + 'dislike_count': int, + 'like_count': int, + 'comment_count': int, + 'channel': 'Dear Jerome', + 'channel_follower_count': int, + } + }] + + def _get_formats_and_subtitles(self, media_url, video_id): + formats, subtitles = [], {} + for url in variadic(media_url): + ext = determine_ext(url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(url, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + return formats, subtitles + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['contentDetails'] + + media_url_list = traverse_obj(nextjs_data, (('rawContentUrl', 'url'), 
)) + formats, subtitles = self._get_formats_and_subtitles(media_url_list, display_id) + + return { + 'id': nextjs_data.get('id') or display_id, + 'title': nextjs_data.get('title') or self._html_search_meta('og:title', webpage), + 'formats': formats, + 'subtitles': subtitles, + 'description': (nextjs_data.get('description') or clean_html(nextjs_data.get('htmlDescription')) + or self._html_search_meta(['description', 'og:description'], webpage)), + 'thumbnail': nextjs_data.get('image') or self._html_search_meta('og:image', webpage), + 'timestamp': parse_iso8601(nextjs_data.get('createdAt')), + 'release_timestamp': parse_iso8601(nextjs_data.get('publishedAt')), + 'modified_timestamp': parse_iso8601( + nextjs_data.get('updatedAt') or self._html_search_meta('og:updated_time', webpage)), + 'duration': int_or_none(nextjs_data.get('duration')), + 'categories': traverse_obj(nextjs_data, ('genres', ..., 'name')), + 'season': nextjs_data.get('seasonName'), + 'season_number': int_or_none(nextjs_data.get('seasonNumber')), + 'channel': traverse_obj(nextjs_data, ('catalog', 'title')), + 'channel_id': traverse_obj(nextjs_data, ('catalog', 'id'), 'catalogId'), + **traverse_obj(nextjs_data, ('meta', 'aggregations', { + 'like_count': 'likes', + 'dislike_count': 'dislikes', + 'comment_count': 'comments', + 'channel_follower_count': 'followers', + })) + } From 839e2a62ae977ae51b1fcec50a8af3d28e1d230c Mon Sep 17 00:00:00 2001 From: MMM Date: Thu, 8 Dec 2022 17:32:17 +0100 Subject: [PATCH 026/153] [extractor/rumble] Add RumbleIE extractor (#5515) Closes #2846 Authored by: flashdagger --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rumble.py | 102 ++++++++++++++++++++++++++------ 2 files changed, 84 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c9dd7463c7..b1d0a9fb02 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1568,6 +1568,7 @@ from .rule34video import Rule34VideoIE 
from .rumble import ( RumbleEmbedIE, + RumbleIE, RumbleChannelIE, ) from .rutube import ( diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 102615c607..b7f798ffbb 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -4,11 +4,15 @@ from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, + UnsupportedError, + clean_html, + get_element_by_class, int_or_none, + parse_count, parse_iso8601, traverse_obj, unescapeHTML, - ExtractorError, ) @@ -111,24 +115,6 @@ class RumbleEmbedIE(InfoExtractor): }] _WEBPAGE_TESTS = [ - { - 'note': 'Rumble embed', - 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', - 'md5': '53af34098a7f92c4e51cf0bd1c33f009', - 'info_dict': { - 'id': 'vb0ofn', - 'ext': 'mp4', - 'timestamp': 1612662578, - 'uploader': 'LovingMontana', - 'channel': 'LovingMontana', - 'upload_date': '20210207', - 'title': 'Winter-loving dog helps girls dig a snow fort ', - 'channel_url': 'https://rumble.com/c/c-546523', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg', - 'duration': 103, - 'live_status': 'not_live', - } - }, { 'note': 'Rumble JS embed', 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', @@ -235,6 +221,84 @@ def _real_extract(self, url): } +class RumbleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?Pv(?!ideos)[\w.-]+)[^/]*$' + _EMBED_REGEX = [r'/v[\w.-]+\.html)>'] + _TESTS = [{ + 'add_ie': ['RumbleEmbed'], + 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 
'description': 'Moose the dog is more than happy to help with digging out this epic snow fort. Great job, Moose!', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 103, + 'like_count': int, + 'view_count': int, + 'live_status': 'not_live', + } + }, { + 'url': 'http://www.rumble.com/vDMUM1?key=value', + 'only_matching': True, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://rumble.com/videos?page=2', + 'playlist_count': 25, + 'info_dict': { + 'id': 'videos?page=2', + 'title': 'All videos', + 'description': 'Browse videos uploaded to Rumble.com', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/live-videos', + 'playlist_mincount': 19, + 'info_dict': { + 'id': 'live-videos', + 'title': 'Live Videos', + 'description': 'Live videos on Rumble.com', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/search/video?q=rumble&sort=views', + 'playlist_count': 24, + 'info_dict': { + 'id': 'video?q=rumble&sort=views', + 'title': 'Search results for: rumble', + 'age_limit': 0, + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + url_info = next(RumbleEmbedIE.extract_from_webpage(self._downloader, url, webpage), None) + if not url_info: + raise UnsupportedError(url) + + release_ts_str = self._search_regex( + r'(?:Livestream begins|Streamed on):\s+

([^<]+)\s+playlist\s*<', webpage, 'playlist title', + r'([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) From 153e88a75151a51cc2a2fbf02d62f66fc09b29d9 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Thu, 29 Dec 2022 17:12:07 +0900 Subject: [PATCH 067/153] [extractor/netverse] Add `NetverseSearch` extractor (#5838) Authored by: HobbyistDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/netverse.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 672eb95962..1b76d82643 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1160,6 +1160,7 @@ from .netverse import ( NetverseIE, NetversePlaylistIE, + NetverseSearchIE, ) from .newgrounds import ( NewgroundsIE, diff --git a/yt_dlp/extractor/netverse.py b/yt_dlp/extractor/netverse.py index 3c4fd92eb0..398198a1b0 100644 --- a/yt_dlp/extractor/netverse.py +++ b/yt_dlp/extractor/netverse.py @@ -1,6 +1,6 @@ import itertools -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from .dailymotion import DailymotionIE from ..utils import smuggle_url, traverse_obj @@ -251,3 +251,31 @@ def _real_extract(self, url): self.parse_playlist(playlist_data['response'], playlist_id), traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')), traverse_obj(playlist_data, ('response', 'webseries_info', 'title'))) + + +class NetverseSearchIE(SearchInfoExtractor): + _SEARCH_KEY = 'netsearch' + + _TESTS = [{ + 'url': 'netsearch10:tetangga', + 'info_dict': { + 'id': 'tetangga', + 'title': 'tetangga', + }, + 'playlist_count': 10, + }] + + def _search_results(self, query): + last_page = None + for i in itertools.count(1): + search_data = self._download_json( + 'https://api.netverse.id/search/elastic/search', query, + 
query={'q': query, 'page': i}, note=f'Downloading page {i}') + + videos = traverse_obj(search_data, ('response', 'data', ...)) + for video in videos: + yield self.url_result(f'https://netverse.id/video/{video["slug"]}', NetverseIE) + + last_page = last_page or traverse_obj(search_data, ('response', 'lastpage')) + if not videos or i >= (last_page or 0): + break From 9a9006ba20f1f9f34183e1bde098c75502a018f8 Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 29 Dec 2022 06:15:38 -0500 Subject: [PATCH 068/153] [extractor/twitcasting] Fix videos with password (#5894) Closes #5888 Authored by: bashonly, Spicadox --- yt_dlp/extractor/twitcasting.py | 34 +++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 735cb0bb08..2548dae047 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -38,7 +38,7 @@ class TwitCastingIE(InfoExtractor): 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20110822', - 'timestamp': 1314010824, + 'timestamp': 1313978424, 'duration': 32, 'view_count': int, }, @@ -52,10 +52,10 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live playing something #3689740', 'uploader_id': 'mttbernardini', - 'description': 'Salve, io sono Matto (ma con la e). 
Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', + 'description': 'md5:1dc7efa2f1ab932fcd119265cebeec69', 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20120212', - 'timestamp': 1329028024, + 'upload_date': '20120211', + 'timestamp': 1328995624, 'duration': 681, 'view_count': int, }, @@ -64,15 +64,22 @@ class TwitCastingIE(InfoExtractor): 'videopassword': 'abc', }, }, { - 'note': 'archive is split in 2 parts', 'url': 'https://twitcasting.tv/loft_heaven/movie/685979292', 'info_dict': { 'id': '685979292', 'ext': 'mp4', - 'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”', - 'duration': 6964.599334, + 'title': '【無料配信】南波一海のhear/here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”', + 'uploader_id': 'loft_heaven', + 'description': 'md5:3a0c7b53019df987ce545c935538bacf', + 'upload_date': '20210604', + 'timestamp': 1622802114, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 6964, + 'view_count': int, + }, + 'params': { + 'skip_download': True, }, - 'playlist_mincount': 2, }] def _parse_data_movie_playlist(self, dmp, video_id): @@ -88,15 +95,18 @@ def _parse_data_movie_playlist(self, dmp, video_id): def _real_extract(self, url): uploader_id, video_id = self._match_valid_url(url).groups() + webpage, urlh = self._download_webpage_handle(url, video_id) video_password = self.get_param('videopassword') request_data = None if video_password: request_data = urlencode_postdata({ 'password': video_password, + **self._hidden_inputs(webpage), }, encoding='utf-8') - webpage, urlh = self._download_webpage_handle( - url, video_id, data=request_data, - headers={'Origin': 'https://twitcasting.tv'}) + webpage, urlh = self._download_webpage_handle( + url, video_id, data=request_data, + headers={'Origin': 'https://twitcasting.tv'}, + note='Trying video password') if urlh.geturl() != url and request_data: webpage = self._download_webpage( urlh.geturl(), video_id, data=request_data, @@ -122,7 +132,7 @@ def _real_extract(self, url): 
duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000) or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage)))) view_count = str_to_int(self._search_regex( - (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*]+datetime="([^"]+)"', webpage, 'datetime', None)) From 3d667e0047915c32f5df9fdd86a4223dc0e9ce8f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 12:03:03 +0000 Subject: [PATCH 069/153] [extractor/slideslive] Support embeds and slides (#5784) Authored by: bashonly, Grub4K, pukkandan --- yt_dlp/extractor/slideslive.py | 390 ++++++++++++++++++++++++++++++--- 1 file changed, 362 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 86c26a8a2b..4268bfeaf1 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -1,16 +1,24 @@ +import re +import urllib.parse + from .common import InfoExtractor from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, smuggle_url, traverse_obj, unified_timestamp, + update_url_query, url_or_none, + xpath_text, ) class SlidesLiveIE(InfoExtractor): - _VALID_URL = r'https?://slideslive\.com/(?P[0-9]+)' + _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P[0-9]+)' _TESTS = [{ - # service_name = yoda + # service_name = yoda, only XML slides info 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', 'info_dict': { 'id': '38902413', @@ -19,12 +27,14 @@ class SlidesLiveIE(InfoExtractor): 'timestamp': 1648189972, 'upload_date': '20220325', 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:42', + 'chapters': 'count:41', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = yoda + # service_name = yoda, /v7/ slides 'url': 'https://slideslive.com/38935785', 'info_dict': { 'id': '38935785', @@ -32,13 +42,15 @@ class SlidesLiveIE(InfoExtractor): 'title': 
'Offline Reinforcement Learning: From Algorithms to Practical Challenges', 'upload_date': '20211115', 'timestamp': 1636996003, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:640', + 'chapters': 'count:639', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = yoda + # service_name = yoda, /v1/ slides 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics', 'info_dict': { 'id': '38973182', @@ -47,12 +59,14 @@ class SlidesLiveIE(InfoExtractor): 'upload_date': '20220201', 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1643728135, + 'thumbnails': 'count:3', + 'chapters': 'count:2', }, 'params': { 'skip_download': 'm3u8', }, }, { - # service_name = youtube + # service_name = youtube, only XML slides info 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost', 'md5': '8a79b5e3d700837f40bd2afca3c8fa01', 'info_dict': { @@ -76,26 +90,253 @@ class SlidesLiveIE(InfoExtractor): 'comment_count': int, 'channel_follower_count': int, 'age_limit': 0, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:169', 'playable_in_embed': True, 'availability': 'unlisted', 'tags': [], 'categories': ['People & Blogs'], + 'chapters': 'count:168', }, }, { - # service_name = youtube + # embed-only presentation, only XML slides info + 'url': 'https://slideslive.com/embed/presentation/38925850', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # embed-only presentation, only JSON slides info, /v5/ slides (.png) + 'url': 'https://slideslive.com/38979920/', + 'info_dict': { + 'id': 
'38979920', + 'ext': 'mp4', + 'title': 'MoReL: Multi-omics Relational Learning', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1654714970, + 'upload_date': '20220608', + 'chapters': 'count:6', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v2/ slides (.jpg) + 'url': 'https://slideslive.com/38954074', + 'info_dict': { + 'id': '38954074', + 'ext': 'mp4', + 'title': 'Decentralized Attribution of Generative Models', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:16', + 'timestamp': 1622806321, + 'upload_date': '20210604', + 'chapters': 'count:15', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v4/ slides (.png) + 'url': 'https://slideslive.com/38979570/', + 'info_dict': { + 'id': '38979570', + 'ext': 'mp4', + 'title': 'Efficient Active Search for Combinatorial Optimization Problems', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:9', + 'timestamp': 1654714896, + 'upload_date': '20220608', + 'chapters': 'count:8', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v10/ slides + 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F', + 'info_dict': { + 'id': '38979880', + 'ext': 'mp4', + 'title': 'The Representation Power of Neural Networks', + 'timestamp': 1654714962, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:22', + 'upload_date': '20220608', + 'chapters': 'count:21', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v7/ slides, 2 video slides + 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com', + 'playlist_count': 3, + 'info_dict': { + 'id': '38979682-playlist', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979682', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + 'timestamp': 
1654714920, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'upload_date': '20220608', + 'chapters': 'count:31', + }, + }, { + 'info_dict': { + 'id': '38979682-021', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021', + 'duration': 3, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }, { + 'info_dict': { + 'id': '38979682-024', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024', + 'duration': 4, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v6/ slides, 1 video slide, edit.videoken.com embed + 'url': 'https://slideslive.com/38979481/', + 'playlist_count': 2, + 'info_dict': { + 'id': '38979481-playlist', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979481', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + 'timestamp': 1654714877, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:43', + 'upload_date': '20220608', + 'chapters': 'count:43', + }, + }, { + 'info_dict': { + 'id': '38979481-013', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013', + 'duration': 3, + 'timestamp': 1654714877, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v3/ slides, .jpg and .png, service_name = youtube + 'url': 'https://slideslive.com/embed/38932460/', + 'info_dict': { + 'id': 'RTPdrgkyTiE', + 'display_id': '38932460', + 'ext': 'mp4', + 'title': 'Active Learning for Hierarchical Multi-Label Classification', + 'description': 'Watch full version of this video at https://slideslive.com/38932460.', + 'channel': 'SlidesLive Videos - A', + 'channel_id': 'UC62SdArr41t_-_fX40QCLRw', + 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 
'uploader': 'SlidesLive Videos - A', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20200903', + 'timestamp': 1602599092, + 'duration': 942, + 'age_limit': 0, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'unlisted', + 'categories': ['People & Blogs'], + 'tags': [], + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)', + 'thumbnails': 'count:21', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = yoda 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', 'only_matching': True, }, { - # service_name = url + # dead link, service_name = url 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', 'only_matching': True, }, { - # service_name = vimeo + # dead link, service_name = vimeo 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + # only XML slides info + 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Reference: https://slideslive.com/embed_presentation.js + for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage): + url_parsed = urllib.parse.urlparse(url) + origin = f'{url_parsed.scheme}://{url_parsed.netloc}' + yield update_url_query( + f'https://slideslive.com/embed/presentation/{embed_id}', { + 
'embed_parent_url': url, + 'embed_container_origin': origin, + }) + + def _download_embed_webpage_handle(self, video_id, headers): + return self._download_webpage_handle( + f'https://slideslive.com/embed/presentation/{video_id}', video_id, + headers=headers, query=traverse_obj(headers, { + 'embed_parent_url': 'Referer', + 'embed_container_origin': 'Origin', + })) + def _extract_custom_m3u8_info(self, m3u8_data): m3u8_dict = {} @@ -108,6 +349,8 @@ def _extract_custom_m3u8_info(self, m3u8_data): 'VOD-VIDEO-ID': 'service_id', 'VOD-VIDEO-SERVERS': 'video_servers', 'VOD-SUBTITLES': 'subtitles', + 'VOD-SLIDES-JSON-URL': 'slides_json_url', + 'VOD-SLIDES-XML-URL': 'slides_xml_url', } for line in m3u8_data.splitlines(): @@ -126,9 +369,33 @@ def _extract_custom_m3u8_info(self, m3u8_data): return m3u8_dict + def _extract_formats(self, cdn_hostname, path, video_id): + formats = [] + formats.extend(self._extract_m3u8_formats( + f'https://{cdn_hostname}/{path}/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)) + formats.extend(self._extract_mpd_formats( + f'https://{cdn_hostname}/{path}/master.mpd', + video_id, mpd_id='dash', fatal=False)) + return formats + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage, urlh = self._download_embed_webpage_handle( + video_id, headers=traverse_obj(parse_qs(url), { + 'Referer': ('embed_parent_url', -1), + 'Origin': ('embed_container_origin', -1)})) + redirect_url = urlh.geturl() + if 'domain_not_allowed' in redirect_url: + domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False) + if not domain: + raise ExtractorError( + 'This is an embed-only presentation. 
Try passing --referer', expected=True) + webpage, _ = self._download_embed_webpage_handle(video_id, headers={ + 'Referer': f'https://{domain}/', + 'Origin': f'https://{domain}', + }) + player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token') player_data = self._download_webpage( f'https://ben.slideslive.com/player/{video_id}', video_id, @@ -139,6 +406,50 @@ def _real_extract(self, url): assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = player_info['service_id'] + slides_info_url = None + slides, slides_info = [], [] + if player_info.get('slides_json_url'): + slides_info_url = player_info['slides_json_url'] + slides = traverse_obj(self._download_json( + slides_info_url, video_id, fatal=False, + note='Downloading slides JSON', errnote=False), 'slides', expected_type=list) or [] + for slide_id, slide in enumerate(slides, start=1): + slides_info.append(( + slide_id, traverse_obj(slide, ('image', 'name')), + int_or_none(slide.get('time'), scale=1000))) + + if not slides and player_info.get('slides_xml_url'): + slides_info_url = player_info['slides_xml_url'] + slides = self._download_xml( + slides_info_url, video_id, fatal=False, + note='Downloading slides XML', errnote='Failed to download slides info') + for slide_id, slide in enumerate(slides.findall('./slide'), start=1): + slides_info.append(( + slide_id, xpath_text(slide, './slideName', 'name'), + int_or_none(xpath_text(slide, './timeSec', 'time')))) + + slides_version = int(self._search_regex( + r'https?://slides\.slideslive\.com/\d+/v(\d+)/\w+\.(?:json|xml)', + slides_info_url, 'slides version', default=0)) + if slides_version < 4: + slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s.jpg' + else: + slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s.png' + + chapters, thumbnails = [], [] + if url_or_none(player_info.get('thumbnail')): + thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']}) + 
for slide_id, slide_path, start_time in slides_info: + if slide_path: + thumbnails.append({ + 'id': f'{slide_id:03d}', + 'url': slide_url_template % (video_id, slide_path), + }) + chapters.append({ + 'title': f'Slide {slide_id:03d}', + 'start_time': start_time, + }) + subtitles = {} for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict): webvtt_url = url_or_none(sub.get('webvtt_url')) @@ -154,25 +465,15 @@ def _real_extract(self, url): 'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''), 'timestamp': unified_timestamp(player_info.get('timestamp')), 'is_live': player_info.get('playlist_type') != 'vod', - 'thumbnail': url_or_none(player_info.get('thumbnail')), + 'thumbnails': thumbnails, + 'chapters': chapters, 'subtitles': subtitles, } - if service_name in ('url', 'yoda'): - if service_name == 'url': - info['url'] = service_id - else: - cdn_hostname = player_info['video_servers'][0] - formats = [] - formats.extend(self._extract_m3u8_formats( - f'https://{cdn_hostname}/{service_id}/master.m3u8', - video_id, 'mp4', m3u8_id='hls', fatal=False, live=True)) - formats.extend(self._extract_mpd_formats( - f'https://{cdn_hostname}/{service_id}/master.mpd', - video_id, mpd_id='dash', fatal=False)) - info.update({ - 'formats': formats, - }) + if service_name == 'url': + info['url'] = service_id + elif service_name == 'yoda': + info['formats'] = self._extract_formats(player_info['video_servers'][0], service_id, video_id) else: info.update({ '_type': 'url_transparent', @@ -185,4 +486,37 @@ def _real_extract(self, url): f'https://player.vimeo.com/video/{service_id}', {'http_headers': {'Referer': url}}) - return info + video_slides = traverse_obj(slides, (..., 'video', 'id')) + if not video_slides: + return info + + def entries(): + yield info + + service_data = self._download_json( + f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data', + video_id, fatal=False, query={ + 'player_token': player_token, 
+ 'videos': ','.join(video_slides), + }, note='Downloading video slides info', errnote='Failed to download video slides info') or {} + + for slide_id, slide in enumerate(slides, 1): + if not traverse_obj(slide, ('video', 'service')) == 'yoda': + continue + video_path = traverse_obj(slide, ('video', 'id')) + cdn_hostname = traverse_obj(service_data, ( + video_path, 'video_servers', ...), get_all=False) + if not cdn_hostname or not video_path: + continue + formats = self._extract_formats(cdn_hostname, video_path, video_id) + if not formats: + continue + yield { + 'id': f'{video_id}-{slide_id:03d}', + 'title': f'{info["title"]} - Slide {slide_id:03d}', + 'timestamp': info['timestamp'], + 'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000), + 'formats': formats, + } + + return self.playlist_result(entries(), f'{video_id}-playlist', info['title']) From 4b183d49620e564219c01714ca8639199f6b1cc0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 14:29:08 +0000 Subject: [PATCH 070/153] [extractor/videoken] Add extractors (#5824) Closes #5818 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 7 + yt_dlp/extractor/videoken.py | 336 ++++++++++++++++++++++++++++++++ 2 files changed, 343 insertions(+) create mode 100644 yt_dlp/extractor/videoken.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1b76d82643..e51228afff 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2097,6 +2097,13 @@ ) from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videoken import ( + VideoKenIE, + VideoKenPlayerIE, + VideoKenPlaylistIE, + VideoKenCategoryIE, + VideoKenTopicIE, +) from .videomore import ( VideomoreIE, VideomoreVideoIE, diff --git a/yt_dlp/extractor/videoken.py b/yt_dlp/extractor/videoken.py new file mode 100644 index 0000000000..560b41a6d7 --- /dev/null +++ 
b/yt_dlp/extractor/videoken.py @@ -0,0 +1,336 @@ +import base64 +import functools +import math +import re +import time +import urllib.parse + +from .common import InfoExtractor +from .slideslive import SlidesLiveIE +from ..utils import ( + ExtractorError, + InAdvancePagedList, + int_or_none, + traverse_obj, + update_url_query, + url_or_none, +) + + +class VideoKenBaseIE(InfoExtractor): + _ORGANIZATIONS = { + 'videos.icts.res.in': 'icts', + 'videos.cncf.io': 'cncf', + 'videos.neurips.cc': 'neurips', + } + _BASE_URL_RE = rf'https?://(?P{"|".join(map(re.escape, _ORGANIZATIONS))})/' + + _PAGE_SIZE = 12 + + def _get_org_id_and_api_key(self, org, video_id): + details = self._download_json( + f'https://analytics.videoken.com/api/videolake/{org}/details', video_id, + note='Downloading organization ID and API key', headers={ + 'Accept': 'application/json', + }) + return details['id'], details['apikey'] + + def _create_slideslive_url(self, video_url, video_id, referer): + if not video_url and not video_id: + return + elif not video_url or 'embed/sign-in' in video_url: + video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}' + if url_or_none(referer): + return update_url_query(video_url, { + 'embed_parent_url': referer, + 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}', + }) + return video_url + + def _extract_videos(self, videos, url): + for video in traverse_obj(videos, (('videos', 'results'), ...)): + video_id = traverse_obj(video, 'youtube_id', 'videoid') + if not video_id: + continue + ie_key = None + if traverse_obj(video, 'type', 'source') == 'youtube': + video_url = video_id + ie_key = 'Youtube' + else: + video_url = traverse_obj(video, 'embed_url', 'embeddableurl') + if urllib.parse.urlparse(video_url).netloc == 'slideslive.com': + ie_key = SlidesLiveIE + video_url = self._create_slideslive_url(video_url, video_id, url) + if not video_url: + continue + yield self.url_result(video_url, ie_key, video_id) + + +class 
VideoKenIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P[\w-]+)' + _TESTS = [{ + # neurips -> videoken -> slideslive + 'url': 'https://videos.neurips.cc/video/slideslive-38922815', + 'info_dict': { + 'id': '38922815', + 'ext': 'mp4', + 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures', + 'timestamp': 1630939331, + 'upload_date': '20210906', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:330', + 'chapters': 'count:329', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # neurips -> videoken -> slideslive -> youtube + 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348', + 'info_dict': { + 'id': '2Xa_dt78rJE', + 'ext': 'mp4', + 'display_id': '38923348', + 'title': 'Machine Education', + 'description': 'Watch full version of this video at https://slideslive.com/38923348.', + 'channel': 'SlidesLive Videos - G2', + 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'uploader': 'SlidesLive Videos - G2', + 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'duration': 2504, + 'timestamp': 1618922125, + 'upload_date': '20200131', + 'age_limit': 0, + 'channel_follower_count': int, + 'view_count': int, + 'availability': 'unlisted', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'categories': ['People & Blogs'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:78', + 'chapters': 'count:77', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # icts -> videoken -> youtube + 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc', + 'info_dict': { + 'id': 
'zysIsojYdvc', + 'ext': 'mp4', + 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad', + 'description': 'md5:87433069d79719eeadc1962cc2ace00b', + 'channel': 'International Centre for Theoretical Sciences', + 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ', + 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ', + 'uploader': 'International Centre for Theoretical Sciences', + 'uploader_id': 'ICTStalks', + 'uploader_url': 'http://www.youtube.com/user/ICTStalks', + 'duration': 3372, + 'upload_date': '20191004', + 'age_limit': 0, + 'live_status': 'not_live', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'categories': ['Science & Technology'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:42', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8', + 'only_matching': True, + }, { + 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI', + 'only_matching': True, + }, { + 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU', + 'only_matching': True, + }] + + def _real_extract(self, url): + hostname, video_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id) + details = self._download_json( + 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={ + 'videoid': video_id, + 'org_id': org_id, + }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON', + errnote='Failed to download VideoKen API JSON', fatal=False) + if details: + return next(self._extract_videos({'videos': [details]}, url)) + # fallback for API error 400 response + elif video_id.startswith('slideslive-'): + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, 
video_id) + elif re.match(r'^[\w-]{11}$', video_id): + self.url_result(video_id, 'Youtube', video_id) + else: + raise ExtractorError('Unable to extract without VideoKen API response') + + +class VideoKenPlayerIE(VideoKenBaseIE): + _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P\d+)' + _TESTS = [{ + 'url': 'https://player.videoken.com/embed/slideslive-38968434', + 'info_dict': { + 'id': '38968434', + 'ext': 'mp4', + 'title': 'Deep Learning with Label Differential Privacy', + 'timestamp': 1643377020, + 'upload_date': '20220128', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'chapters': 'count:29', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + + +class VideoKenPlaylistIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P\d+)' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/playlist/381', + 'playlist_mincount': 117, + 'info_dict': { + 'id': '381', + 'title': 'Cosmology - The Next Decade', + }, + }] + + def _real_extract(self, url): + hostname, playlist_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id) + videos = self._download_json( + f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/', + playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON') + return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title')) + + +class VideoKenCategoryIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P\d+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': '1822', + 'title': 'Programs', + }, + }, { + 'url': 
'https://videos.neurips.cc/category/350/', + 'playlist_mincount': 34, + 'info_dict': { + 'id': '350', + 'title': 'NeurIPS 2018', + }, + }, { + 'url': 'https://videos.cncf.io/category/479/', + 'playlist_mincount': 328, + 'info_dict': { + 'id': '479', + 'title': 'KubeCon + CloudNativeCon Europe\'19', + }, + }] + + def _get_category_page(self, category_id, org_id, page=1, note=None): + return self._download_json( + f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id, + fatal=False, note=note if note else f'Downloading category page {page}', + query={ + 'category_id': category_id, + 'page_number': page, + 'length': self._PAGE_SIZE, + }, headers={'Accept': 'application/json'}) or {} + + def _entries(self, category_id, org_id, url, page): + videos = self._get_category_page(category_id, org_id, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, category_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id) + category_info = self._get_category_page(category_id, org_id, note='Downloading category info') + category = category_info['category_name'] + total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, category_id, org_id, url), + total_pages, self._PAGE_SIZE), category_id, category) + + +class VideoKenTopicIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P[^/#?]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.neurips.cc/topic/machine%20learning/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': 'machine_learning', + 'title': 'machine learning', + }, + }, { + 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/', + 'playlist_mincount': 77, + 'info_dict': { + 'id': 'gravitational_waves', + 'title': 'gravitational waves' + }, + }, { + 'url': 
'https://videos.cncf.io/topic/prometheus/', + 'playlist_mincount': 134, + 'info_dict': { + 'id': 'prometheus', + 'title': 'prometheus', + }, + }] + + def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None): + return self._download_json( + 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={ + 'orgid': org_id, + 'size': self._PAGE_SIZE, + 'query': topic, + 'page': page, + 'sort': 'upload_desc', + 'filter': 'all', + 'token': api_key, + 'is_topic': 'true', + 'category': '', + 'searchid': search_id, + }, headers={'Accept': 'application/json'}, + note=note if note else f'Downloading topic page {page}') or {} + + def _entries(self, topic, org_id, search_id, api_key, url, page): + videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, topic_id = self._match_valid_url(url).group('host', 'id') + topic = urllib.parse.unquote(topic_id) + topic_id = topic.replace(' ', '_') + org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic) + search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode() + total_pages = int_or_none(self._get_topic_page( + topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages']) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, topic, org_id, search_id, api_key, url), + total_pages, self._PAGE_SIZE), topic_id, topic) From 53006b35ea8b26ff31a96a423ddaa3304d0a124e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 15:04:09 +0000 Subject: [PATCH 071/153] [extractor/amazon] Add `AmazonReviews` extractor (#5857) Closes #5766 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/amazon.py | 116 ++++++++++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 8 deletions(-) diff --git 
a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e51228afff..4fed24c35b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -87,7 +87,10 @@ AluraCourseIE ) from .amcnetworks import AMCNetworksIE -from .amazon import AmazonStoreIE +from .amazon import ( + AmazonStoreIE, + AmazonReviewsIE, +) from .amazonminitv import ( AmazonMiniTVIE, AmazonMiniTVSeasonIE, diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py index 4d3170683a..a03f983e0e 100644 --- a/yt_dlp/extractor/amazon.py +++ b/yt_dlp/extractor/amazon.py @@ -1,5 +1,17 @@ +import re + from .common import InfoExtractor -from ..utils import ExtractorError, int_or_none +from ..utils import ( + ExtractorError, + clean_html, + float_or_none, + get_element_by_attribute, + get_element_by_class, + int_or_none, + js_to_json, + traverse_obj, + url_or_none, +) class AmazonStoreIE(InfoExtractor): @@ -9,7 +21,7 @@ class AmazonStoreIE(InfoExtractor): 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', + 'title': str, }, 'playlist_mincount': 1, 'playlist': [{ @@ -20,28 +32,32 @@ class AmazonStoreIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 34, }, - }] + }], + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:d1d3352428f8f015706c84b31e132169', + 'title': str, }, 'playlist_mincount': 4, + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + 'title': str, }, 'playlist-mincount': 1, + 'expected_warnings': ['Unable to extract data'], }, { 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', 'info_dict': { 'id': 
'B08WX337PQ', - 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + 'title': str, }, 'playlist_mincount': 1, + 'expected_warnings': ['Unable to extract data'], }] def _real_extract(self, url): @@ -52,7 +68,7 @@ def _real_extract(self, url): try: data_json = self._search_json( r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, - transform_source=lambda x: x.replace(R'\\u', R'\u')) + transform_source=js_to_json) except ExtractorError as e: retry.error = e @@ -66,3 +82,89 @@ def _real_extract(self, url): 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) + + +class AmazonReviewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P[^/&#$?]+)' + _TESTS = [{ + 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl', + 'info_dict': { + 'id': 'R10VE9VUSY19L3', + 'ext': 'mp4', + 'title': 'Get squad #Suspicious', + 'description': 'md5:7012695052f440a1e064e402d87e0afb', + 'uploader': 'Kimberly Cronkright', + 'average_rating': 1.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }, { + 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US', + 'info_dict': { + 'id': 'R10VE9VUSY19L3', + 'ext': 'mp4', + 'title': 'Get squad #Suspicious', + 'description': 'md5:7012695052f440a1e064e402d87e0afb', + 'uploader': 'Kimberly Cronkright', + 'average_rating': 1.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }, { + 'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/', + 'info_dict': { + 'id': 'RV1CO8JN5VGXV', + 'ext': 'mp4', + 'title': 'Not sure about its durability', + 'description': 
'md5:1a252c106357f0a3109ebf37d2e87494', + 'uploader': 'Shoaib Gulzar', + 'average_rating': 2.0, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['Review body was not found in webpage'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + for retry in self.RetryManager(): + webpage = self._download_webpage(url, video_id) + review_body = get_element_by_attribute('data-hook', 'review-body', webpage) + if not review_body: + retry.error = ExtractorError('Review body was not found in webpage', expected=True) + + formats, subtitles = [], {} + + manifest_url = self._search_regex( + r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None) + if url_or_none(manifest_url): + fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + manifest_url, video_id, 'mp4', fatal=False) + formats.extend(fmts) + + video_url = self._search_regex( + r']+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None) + if url_or_none(video_url): + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'http-mp4', + }) + + if not formats: + self.raise_no_formats('No video found for this customer review', expected=True) + + return { + 'id': video_id, + 'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage)) + or self._html_extract_title(webpage)), + 'description': clean_html(traverse_obj(re.findall( + r'(.+?)', review_body), -1)), + 'uploader': clean_html(get_element_by_class('a-profile-name', webpage)), + 'average_rating': float_or_none(clean_html(get_element_by_attribute( + 'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]), + 'thumbnail': self._search_regex( + r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None), + 'formats': formats, + 'subtitles': subtitles, + } From 2647c933b8ed22f95dd8e9866c4db031867a1bc8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 16:32:54 +0000 
Subject: [PATCH 072/153] [extractor/wistia] Improve extension detection (#5415) Closes #5053 Authored by: bashonly, Grub4k, pukkandan --- yt_dlp/extractor/wistia.py | 41 ++++++++----- yt_dlp/utils.py | 122 +++++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 59 deletions(-) diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 38dcc2f5b5..884fa4b5fd 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -6,12 +6,15 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + HEADRequest, + determine_ext, float_or_none, int_or_none, parse_qs, traverse_obj, try_get, update_url_query, + urlhandle_detect_ext, ) @@ -34,6 +37,16 @@ def _download_embed_config(self, config_type, config_id, referer): return embed_config + def _get_real_ext(self, url): + ext = determine_ext(url, default_ext='bin') + if ext == 'bin': + urlh = self._request_webpage( + HEADRequest(url), None, note='Checking media extension', + errnote='HEAD request returned error', fatal=False) + if urlh: + ext = urlhandle_detect_ext(urlh, default='bin') + return 'mp4' if ext == 'mov' else ext + def _extract_media(self, embed_config): data = embed_config['media'] video_id = data['hashedId'] @@ -51,13 +64,13 @@ def _extract_media(self, embed_config): continue elif atype in ('still', 'still_image'): thumbnails.append({ - 'url': aurl, + 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'), 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), 'filesize': int_or_none(a.get('size')), }) else: - aext = a.get('ext') + aext = a.get('ext') or self._get_real_ext(aurl) display_name = a.get('display_name') format_id = atype if atype and atype.endswith('_video') and display_name: @@ -169,26 +182,26 @@ class WistiaIE(WistiaBaseIE): 'md5': '10c1ce9c4dde638202513ed17a3767bd', 'info_dict': { 'id': 'a6ndpko1wg', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'Episode 2: Boxed Water\'s retention is thirsty', 'upload_date': 
'20210324', 'description': 'md5:da5994c2c2d254833b412469d9666b7a', 'duration': 966.0, 'timestamp': 1616614369, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png', } }, { 'url': 'wistia:5vd7p4bct5', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$', }, }, { 'url': 'wistia:sh7fpupwlt', @@ -208,25 +221,25 @@ class WistiaIE(WistiaBaseIE): 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool', 'info_dict': { 'id': 'cqwukac3z1', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content', 'duration': 158.125, 'timestamp': 1618974400, 'description': 'md5:27abc99a758573560be72600ef95cece', 'upload_date': '20210421', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg', } }, { 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg', 
'description': 'a Paywall Videos video', }, }] @@ -302,9 +315,9 @@ class WistiaChannelIE(WistiaBaseIE): 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', 'info_dict': { 'id': 'sp5dqjzw3n', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'The Roof S2: The Modern CRO', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png', 'duration': 86.487, 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n', 'timestamp': 1619790290, @@ -334,12 +347,12 @@ class WistiaChannelIE(WistiaBaseIE): 'info_dict': { 'id': 'pz0m0l0if3', 'title': 'A Framework for Improving Product Team Performance', - 'ext': 'bin', + 'ext': 'mp4', 'timestamp': 1653935275, 'upload_date': '20220530', 'description': 'Learn how to help your company improve and achieve your product related goals.', 'duration': 1854.39, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png', }, 'params': {'noplaylist': True, 'skip_download': True}, }] diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 65408bf19b..3947dcf2e5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3480,67 +3480,93 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' -def mimetype2ext(mt): - if mt is None: +def mimetype2ext(mt, default=NO_DEFAULT): + if not isinstance(mt, str): + if default is not NO_DEFAULT: + return default return None - mt, _, params = mt.partition(';') - mt = mt.strip() - - FULL_MAP = { - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. 
Here use .mp3 as - # it's the most popular one - 'audio/mpeg': 'mp3', - 'audio/x-wav': 'wav', - 'audio/wav': 'wav', - 'audio/wave': 'wav', - } - - ext = FULL_MAP.get(mt) - if ext is not None: - return ext - - SUBTYPE_MAP = { + MAP = { + # video '3gpp': '3gp', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-flv': 'flv', - 'x-mp4-fragmented': 'mp4', - 'x-ms-sami': 'sami', - 'x-ms-wmv': 'wmv', + 'mp2t': 'ts', + 'mp4': 'mp4', + 'mpeg': 'mpeg', 'mpegurl': 'm3u8', - 'x-mpegurl': 'm3u8', - 'vnd.apple.mpegurl': 'm3u8', + 'quicktime': 'mov', + 'webm': 'webm', + 'vp9': 'vp9', + 'x-flv': 'flv', + 'x-m4v': 'm4v', + 'x-matroska': 'mkv', + 'x-mng': 'mng', + 'x-mp4-fragmented': 'mp4', + 'x-ms-asf': 'asf', + 'x-ms-wmv': 'wmv', + 'x-msvideo': 'avi', + + # application (streaming playlists) 'dash+xml': 'mpd', 'f4m+xml': 'f4m', 'hds+xml': 'f4m', + 'vnd.apple.mpegurl': 'm3u8', 'vnd.ms-sstr+xml': 'ism', - 'quicktime': 'mov', - 'mp2t': 'ts', + 'x-mpegurl': 'm3u8', + + # audio + 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. 
+ # Using .mp3 as it's the most popular one + 'audio/mpeg': 'mp3', + 'audio/webm': 'weba', + 'audio/x-matroska': 'mka', + 'audio/x-mpegurl': 'm3u', + 'midi': 'mid', + 'ogg': 'ogg', + 'wav': 'wav', + 'wave': 'wav', + 'x-aac': 'aac', + 'x-flac': 'flac', + 'x-m4a': 'm4a', + 'x-realaudio': 'ra', 'x-wav': 'wav', - 'filmstrip+json': 'fs', + + # image + 'avif': 'avif', + 'bmp': 'bmp', + 'gif': 'gif', + 'jpeg': 'jpg', + 'png': 'png', 'svg+xml': 'svg', - } + 'tiff': 'tif', + 'vnd.wap.wbmp': 'wbmp', + 'webp': 'webp', + 'x-icon': 'ico', + 'x-jng': 'jng', + 'x-ms-bmp': 'bmp', - _, _, subtype = mt.rpartition('/') - ext = SUBTYPE_MAP.get(subtype.lower()) - if ext is not None: - return ext + # caption + 'filmstrip+json': 'fs', + 'smptett+xml': 'tt', + 'ttaf+xml': 'dfxp', + 'ttml+xml': 'ttml', + 'x-ms-sami': 'sami', - SUFFIX_MAP = { + # misc + 'gzip': 'gz', 'json': 'json', 'xml': 'xml', 'zip': 'zip', - 'gzip': 'gz', } - _, _, suffix = subtype.partition('+') - ext = SUFFIX_MAP.get(suffix) - if ext is not None: - return ext + mimetype = mt.partition(';')[0].strip().lower() + _, _, subtype = mimetype.rpartition('/') + ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + if ext: + return ext + elif default is not NO_DEFAULT: + return default return subtype.replace('+', '.') @@ -3634,7 +3660,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): return 'mkv' if allow_mkv else preferences[-1] -def urlhandle_detect_ext(url_handle): +def urlhandle_detect_ext(url_handle, default=NO_DEFAULT): getheader = url_handle.headers.get cd = getheader('Content-Disposition') @@ -3645,7 +3671,13 @@ def urlhandle_detect_ext(url_handle): if e: return e - return mimetype2ext(getheader('Content-Type')) + meta_ext = getheader('x-amz-meta-name') + if meta_ext: + e = meta_ext.rpartition('.')[2] + if e: + return e + + return mimetype2ext(getheader('Content-Type'), default=default) def encode_data_uri(data, mime_type): From c1edb853b0a0cc69ea08337c0c5aee669b26d3d2 Mon 
Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 17:31:01 +0000 Subject: [PATCH 073/153] [extractor/kick] Add extractor (#5736) Closes #5722 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/kick.py | 127 ++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 yt_dlp/extractor/kick.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4fed24c35b..a2b92b85ae 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -844,6 +844,10 @@ KhanAcademyIE, KhanAcademyUnitIE, ) +from .kick import ( + KickIE, + KickVODIE, +) from .kicker import KickerIE from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py new file mode 100644 index 0000000000..a79ffb7a98 --- /dev/null +++ b/yt_dlp/extractor/kick.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor + +from ..utils import ( + HEADRequest, + UserNotLive, + float_or_none, + merge_dicts, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class KickBaseIE(InfoExtractor): + def _real_initialize(self): + self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session') + xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') + if not xsrf_token: + self.write_debug('kick.com did not set XSRF-TOKEN cookie') + KickBaseIE._API_HEADERS = { + 'Authorization': f'Bearer {xsrf_token.value}', + 'X-XSRF-TOKEN': xsrf_token.value, + } if xsrf_token else {} + + def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): + return self._download_json( + f'https://kick.com/api/v1/{path}', display_id, note=note, + headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + + +class KickIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w_]+)' + 
_TESTS = [{ + 'url': 'https://kick.com/yuppy', + 'info_dict': { + 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'ext': 'mp4', + 'title': str, + 'description': str, + 'channel': 'yuppy', + 'channel_id': '33538', + 'uploader': 'Yuppy', + 'uploader_id': '33793', + 'upload_date': str, + 'live_status': 'is_live', + 'timestamp': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': list, + }, + 'skip': 'livestream', + }, { + 'url': 'https://kick.com/kmack710', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + response = self._call_api(f'channels/{channel}', channel) + if not traverse_obj(response, 'livestream', expected_type=dict): + raise UserNotLive(video_id=channel) + + return { + 'id': str(traverse_obj( + response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), + 'formats': self._extract_m3u8_formats( + response['playback_url'], channel, 'mp4', live=True), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('user', 'bio')), + 'channel': channel, + 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), + 'uploader': traverse_obj(response, 'name', ('user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), + 'is_live': True, + 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + } + + +class KickVODIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', + 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'info_dict': { + 'id': 
'54244b5e-050a-4df4-a013-b2433dafbe35', + 'ext': 'mp4', + 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', + 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', + 'channel': 'kmack710', + 'channel_id': '16278', + 'uploader': 'Kmack710', + 'uploader_id': '16412', + 'upload_date': '20221206', + 'timestamp': 1670318289, + 'duration': 40104.0, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': ['Grand Theft Auto V'], + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + response = self._call_api(f'video/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), + 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), + 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), + 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), + 'timestamp': unified_timestamp(response.get('created_at')), + 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + } From ca2f6e14e65f0faf92cabff8b7e5b4760363c52e Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Fri, 30 Dec 2022 03:01:22 +0900 Subject: [PATCH 074/153] [extractor/BiliLive] Fix extractor - Remove unnecessary group in `_VALID_URL` - This extractor always returns livestreams --- yt_dlp/extractor/bilibili.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 616a549607..37711c138a 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1034,7 +1034,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(blanc/)?(?P\d+)' + _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', @@ -1114,6 +1114,7 @@ def _real_extract(self, url): 'thumbnail': room_data.get('user_cover'), 'timestamp': stream_data.get('live_time'), 'formats': formats, + 'is_live': True, 'http_headers': { 'Referer': url, }, From e107c2b8cf8d6f3506d07bc64fc243682ee49b1e Mon Sep 17 00:00:00 2001 From: nosoop Date: Thu, 29 Dec 2022 10:46:43 -0800 Subject: [PATCH 075/153] [extractor/soundcloud] Support user permalink (#5842) Closes #5841 Authored by: nosoop --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/soundcloud.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a2b92b85ae..352de83cac 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1710,6 +1710,7 @@ SoundcloudSetIE, SoundcloudRelatedIE, SoundcloudUserIE, + SoundcloudUserPermalinkIE, SoundcloudTrackStationIE, SoundcloudPlaylistIE, SoundcloudSearchIE, diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 4879d48c80..979f23f44f 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -782,6 +782,27 @@ def _real_extract(self, url): '%s (%s)' % (user['username'], resource.capitalize())) +class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P\d+)' + IE_NAME = 'soundcloud:user:permalink' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/users/30909869', + 'info_dict': { + 'id': '30909869', + 'title': 'neilcic', + }, + 'playlist_mincount': 23, + 
}] + + def _real_extract(self, url): + user_id = self._match_id(url) + user = self._download_json( + self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS) + + return self._extract_playlist( + f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username')) + + class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P[^/?#&]+)' IE_NAME = 'soundcloud:trackstation' From efa944f4bc892321a0d01dcddb210405761ecada Mon Sep 17 00:00:00 2001 From: Anant Murmu Date: Fri, 30 Dec 2022 08:13:49 +0530 Subject: [PATCH 076/153] [cleanup] Use `random.choices` (#5800) Authored by: freezboltz --- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/adn.py | 2 +- yt_dlp/extractor/discovery.py | 2 +- yt_dlp/extractor/funimation.py | 2 +- yt_dlp/extractor/linuxacademy.py | 5 ++--- yt_dlp/extractor/tencent.py | 4 ++-- yt_dlp/extractor/tiktok.py | 10 +++++----- yt_dlp/extractor/videa.py | 2 +- yt_dlp/extractor/viu.py | 2 +- yt_dlp/extractor/vrv.py | 2 +- yt_dlp/extractor/youku.py | 4 ++-- 11 files changed, 18 insertions(+), 19 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index abb0ddfe52..17f37a6432 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1068,7 +1068,7 @@ def _outtmpl_expandpath(outtmpl): # correspondingly that is not what we want since we need to keep # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. 
- sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) + sep = ''.join(random.choices(ascii_letters, k=32)) outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index e0c18c8773..f1f55e87fc 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -168,7 +168,7 @@ def _real_extract(self, url): }, data=b'')['token'] links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + self._K = ''.join(random.choices('0123456789abcdef', k=16)) message = bytes_to_intlist(json.dumps({ 'k': self._K, 't': token, diff --git a/yt_dlp/extractor/discovery.py b/yt_dlp/extractor/discovery.py index fd3fc8fb0f..e6e109d5c5 100644 --- a/yt_dlp/extractor/discovery.py +++ b/yt_dlp/extractor/discovery.py @@ -78,7 +78,7 @@ def _real_extract(self, url): 'Downloading token JSON metadata', query={ 'authRel': 'authorization', 'client_id': '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'nonce': ''.join(random.choices(string.ascii_letters, k=32)), 'redirectUri': 'https://www.discovery.com/', })['access_token'] diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 18363c1b91..47c316664a 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -210,7 +210,7 @@ def _real_extract(self, url): page = self._download_json( 'https://www.funimation.com/api/showexperience/%s/' % experience_id, display_id, headers=headers, expected_status=403, query={ - 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + 'pinst_id': ''.join(random.choices(string.digits + string.ascii_letters, k=8)), }, note=f'Downloading {format_name} JSON') sources = page.get('items') or [] if not sources: diff --git 
a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py index a570248b7a..7bb64e17c4 100644 --- a/yt_dlp/extractor/linuxacademy.py +++ b/yt_dlp/extractor/linuxacademy.py @@ -75,9 +75,8 @@ class LinuxAcademyIE(InfoExtractor): def _perform_login(self, username, password): def random_string(): - return ''.join([ - random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') - for _ in range(32)]) + return ''.join(random.choices( + '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32)) webpage, urlh = self._download_webpage_handle( self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py index ff8bf991ef..44cae04720 100644 --- a/yt_dlp/extractor/tencent.py +++ b/yt_dlp/extractor/tencent.py @@ -32,7 +32,7 @@ def _get_ckey(self, video_id, url, guid): padding_mode='whitespace').hex().upper() def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): - guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)]) + guid = ''.join(random.choices(string.digits + string.ascii_lowercase, k=16)) ckey = self._get_ckey(video_id, video_url, guid) query = { 'vid': video_id, @@ -55,7 +55,7 @@ def _get_video_api_response(self, video_url, video_id, series_id, subtitle_forma 'platform': self._PLATFORM, # For VQQ 'guid': guid, - 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)), + 'flowid': ''.join(random.choices(string.digits + string.ascii_lowercase, k=32)), } return self._search_json(r'QZOutputJson=', self._download_webpage( diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 2dd4510cc3..709d944dc6 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -49,7 +49,7 @@ def _get_sigi_state(self, webpage, display_id): def _call_api_impl(self, ep, query, manifest_app_version, video_id, 
fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) if webpage_cookies.get('sid_tt'): self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value) @@ -68,8 +68,8 @@ def _build_api_query(self, query, app_version, manifest_app_version): 'build_number': app_version, 'manifest_version_code': manifest_app_version, 'update_version_code': manifest_app_version, - 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)), - 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]), + 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), + 'uuid': ''.join(random.choices(string.digits, k=16)), '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', @@ -638,7 +638,7 @@ def _video_entries_api(self, webpage, user_id, username): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. 
} for page in itertools.count(1): @@ -686,7 +686,7 @@ def _entries(self, list_id, display_id): 'cursor': 0, 'count': 20, 'type': 5, - 'device_id': ''.join(random.choice(string.digits) for i in range(19)) + 'device_id': ''.join(random.choices(string.digits, k=19)) } for page in itertools.count(1): diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 52fa8fcec2..59ae933b08 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -119,7 +119,7 @@ def _real_extract(self, url): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] query = parse_qs(player_url) - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + random_seed = ''.join(random.choices(string.ascii_letters + string.digits, k=8)) query['_s'] = random_seed query['_t'] = result[:16] diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 19d48234e4..dd4cad7ba8 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -251,7 +251,7 @@ def _login(self, country_code, video_id): return self._user_token def _get_token(self, country_code, video_id): - rand = ''.join(random.choice('0123456789') for _ in range(10)) + rand = ''.join(random.choices('0123456789', k=10)) return self._download_json( f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id, headers={'Content-Type': 'application/json'}, note='Getting bearer token', diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 89fa7affc2..ad9dc568a6 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -30,7 +30,7 @@ def _call_api(self, path, video_id, note, data=None): base_url = self._API_DOMAIN + '/core/' + path query = [ ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), - ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))), ('oauth_signature_method', 'HMAC-SHA1'), ('oauth_timestamp', int(time.time())), ] 
diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index ab59200d79..404f196f46 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -129,8 +129,8 @@ class YoukuIE(InfoExtractor): @staticmethod def get_ysuid(): - return '%d%s' % (int(time.time()), ''.join([ - random.choice(string.ascii_letters) for i in range(3)])) + return '%d%s' % (int(time.time()), ''.join( + random.choices(string.ascii_letters, k=3))) def get_format_name(self, fm): _dict = { From 4455918e7f090ace0b0c2537bbfd364956eb66cb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 10:12:13 +0530 Subject: [PATCH 077/153] [extractor/stv] Detect DRM Closes #5320 --- yt_dlp/extractor/stv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index c879fb52eb..8b3e63538c 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -73,6 +73,8 @@ def _real_extract(self, url): }) programme = result.get('programme') or {} + if programme.get('drmEnabled'): + self.report_drm(video_id) return { '_type': 'url_transparent', From 119e40ef64b25f66a39246e87ce6c143cd34276d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:15:41 +0530 Subject: [PATCH 078/153] Add pre-processor stage `video` Related: #456, #5808 --- README.md | 44 +++++++++++++++++++------------------ yt_dlp/YoutubeDL.py | 17 +++++++++------ yt_dlp/options.py | 53 +++++++++++++++++++++------------------------ yt_dlp/utils.py | 2 +- 4 files changed, 59 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 440ed19348..d31fedb00e 100644 --- a/README.md +++ b/README.md @@ -725,7 +725,7 @@ ## Verbosity and Simulation Options: screen, optionally prefixed with when to print it, separated by a ":". Supported values of "WHEN" are the same as that of - --use-postprocessor, and "video" (default). + --use-postprocessor (default: video). Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. 
This option can be used multiple times @@ -979,18 +979,18 @@ ## Post-Processing Options: --ffmpeg-location PATH Location of the ffmpeg binary; either the path to the binary or its containing directory --exec [WHEN:]CMD Execute a command, optionally prefixed with - when to execute it (after_move if - unspecified), separated by a ":". Supported - values of "WHEN" are the same as that of - --use-postprocessor. Same syntax as the - output template can be used to pass any - field as arguments to the command. After - download, an additional field "filepath" - that contains the final path of the - downloaded file is also available, and if no - fields are passed, %(filepath)q is appended - to the end of the command. This option can - be used multiple times + when to execute it, separated by a ":". + Supported values of "WHEN" are the same as + that of --use-postprocessor (default: + after_move). Same syntax as the output + template can be used to pass any field as + arguments to the command. After download, an + additional field "filepath" that contains + the final path of the downloaded file is + also available, and if no fields are passed, + %(filepath)q is appended to the end of the + command. This option can be used multiple + times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format (currently supported: ass, lrc, srt, vtt) @@ -1028,14 +1028,16 @@ ## Post-Processing Options: postprocessor is invoked. It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), - "before_dl" (before each video download), - "post_process" (after each video download; - default), "after_move" (after moving video - file to it's final locations), "after_video" - (after downloading and processing all - formats of a video), or "playlist" (at end - of playlist). 
This option can be used - multiple times to add different postprocessors + "video" (after --format; before + --print/--output), "before_dl" (before each + video download), "post_process" (after each + video download; default), "after_move" + (after moving video file to it's final + locations), "after_video" (after downloading + and processing all formats of a video), or + "playlist" (at end of playlist). This option + can be used multiple times to add different + postprocessors ## SponsorBlock Options: Make chapter entries for, or remove various segments (sponsor, diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 17f37a6432..5057323274 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2977,6 +2977,16 @@ def process_info(self, info_dict): # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) + + def replace_info_dict(new_info): + nonlocal info_dict + if new_info == info_dict: + return + info_dict.clear() + info_dict.update(new_info) + + new_info, _ = self.pre_process(info_dict, 'video') + replace_info_dict(new_info) self._num_downloads += 1 # info_dict['_filename'] needs to be set for backward compatibility @@ -3090,13 +3100,6 @@ def _write_link_file(link_type): for link_type, should_write in write_links.items()): return - def replace_info_dict(new_info): - nonlocal info_dict - if new_info == info_dict: - return - info_dict.clear() - info_dict.update(new_info) - new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) replace_info_dict(new_info) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index bc574b8857..096a502491 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -277,6 +277,20 @@ def _dict_from_options_callback( out_dict[key] = out_dict.get(key, []) + [val] if append else val setattr(parser.values, option.dest, out_dict) + def when_prefix(default): + return { + 'default': {}, + 'type': 'str', + 'action': 'callback', + 'callback': 
_dict_from_options_callback, + 'callback_kwargs': { + 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)), + 'default_key': default, + 'multiple_keys': False, + 'append': True, + }, + } + parser = _YoutubeDLOptionParser() alias_group = optparse.OptionGroup(parser, 'Aliases') Formatter = string.Formatter() @@ -1086,28 +1100,16 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Do not download the video but write all related files (Alias: --no-download)') verbosity.add_option( '-O', '--print', - metavar='[WHEN:]TEMPLATE', dest='forceprint', default={}, type='str', - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'video', - 'multiple_keys': False, - 'append': True, - }, help=( + metavar='[WHEN:]TEMPLATE', dest='forceprint', **when_prefix('video'), + help=( 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". ' - 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: video). ' 'Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. ' 'This option can be used multiple times')) verbosity.add_option( '--print-to-file', - metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2, - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': 'video|' + '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'video', - 'multiple_keys': False, - 'append': True, - }, help=( + metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', nargs=2, **when_prefix('video'), + help=( 'Append given template to the file. The values of WHEN and TEMPLATE are same as that of --print. ' 'FILE uses the same syntax as the output template. 
This option can be used multiple times')) verbosity.add_option( @@ -1629,16 +1631,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help='Location of the ffmpeg binary; either the path to the binary or its containing directory') postproc.add_option( '--exec', - metavar='[WHEN:]CMD', dest='exec_cmd', default={}, type='str', - action='callback', callback=_dict_from_options_callback, - callback_kwargs={ - 'allowed_keys': '|'.join(map(re.escape, POSTPROCESS_WHEN)), - 'default_key': 'after_move', - 'multiple_keys': False, - 'append': True, - }, help=( - 'Execute a command, optionally prefixed with when to execute it (after_move if unspecified), separated by a ":". ' - 'Supported values of "WHEN" are the same as that of --use-postprocessor. ' + metavar='[WHEN:]CMD', dest='exec_cmd', **when_prefix('after_move'), + help=( + 'Execute a command, optionally prefixed with when to execute it, separated by a ":". ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: after_move). ' 'Same syntax as the output template can be used to pass any field as arguments to the command. ' 'After download, an additional field "filepath" that contains the final path of the downloaded file ' 'is also available, and if no fields are passed, %(filepath)q is appended to the end of the command. ' @@ -1714,7 +1710,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'ARGS are a semicolon ";" delimited list of NAME=VALUE. ' 'The "when" argument determines when the postprocessor is invoked. 
' 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), ' - '"before_dl" (before each video download), "post_process" (after each video download; default), ' + '"video" (after --format; before --print/--output), "before_dl" (before each video download), ' + '"post_process" (after each video download; default), ' '"after_move" (after moving video file to it\'s final locations), ' '"after_video" (after downloading and processing all formats of a video), ' 'or "playlist" (at end of playlist). ' diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 3947dcf2e5..43b5fda1d2 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3395,7 +3395,7 @@ def q(qid): return q -POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') +POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') DEFAULT_OUTTMPL = { From fe74d5b592438c669f5717b34504f27c34ca9904 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:01:14 +0530 Subject: [PATCH 079/153] Let `--parse/replace-in-metadata` run at any post-processing stage Closes #5808, #456 --- README.md | 13 +++++++++---- yt_dlp/__init__.py | 14 ++++++++------ yt_dlp/options.py | 12 +++++++----- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d31fedb00e..500f92387b 100644 --- a/README.md +++ b/README.md @@ -952,13 +952,18 @@ ## Post-Processing Options: mkv/mka video files --no-embed-info-json Do not embed the infojson as an attachment to the video file - --parse-metadata FROM:TO Parse additional metadata like title/artist + --parse-metadata [WHEN:]FROM:TO + Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" - for details - --replace-in-metadata FIELDS REGEX REPLACE + for details. 
Supported values of "WHEN" are + the same as that of --use-postprocessor + (default: pre_process) + --replace-in-metadata [WHEN:]FIELDS REGEX REPLACE Replace text in a metadata field using the given regex. This option can be used - multiple times + multiple times. Supported values of "WHEN" + are the same as that of --use-postprocessor + (default: pre_process) --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --concat-playlist POLICY Concatenate videos in a playlist. One of diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 202f102ba9..3490816c4c 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -386,10 +386,12 @@ def metadataparser_actions(f): raise ValueError(f'{cmd} is invalid; {err}') yield action - parse_metadata = opts.parse_metadata or [] if opts.metafromtitle is not None: - parse_metadata.append('title:%s' % opts.metafromtitle) - opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata))) + opts.parse_metadata.setdefault('pre_process', []).append('title:%s' % opts.metafromtitle) + opts.parse_metadata = { + k: list(itertools.chain(*map(metadataparser_actions, v))) + for k, v in opts.parse_metadata.items() + } # Other options if opts.playlist_items is not None: @@ -561,11 +563,11 @@ def report_deprecation(val, old, new=None): def get_postprocessors(opts): yield from opts.add_postprocessors - if opts.parse_metadata: + for when, actions in opts.parse_metadata.items(): yield { 'key': 'MetadataParser', - 'actions': opts.parse_metadata, - 'when': 'pre_process' + 'actions': actions, + 'when': when } sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove if sponsorblock_query: diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 096a502491..ed83cb763e 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1586,14 +1586,16 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=optparse.SUPPRESS_HELP) postproc.add_option( 
'--parse-metadata', - metavar='FROM:TO', dest='parse_metadata', action='append', + metavar='[WHEN:]FROM:TO', dest='parse_metadata', **when_prefix('pre_process'), help=( - 'Parse additional metadata like title/artist from other fields; ' - 'see "MODIFYING METADATA" for details')) + 'Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" for details. ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)')) postproc.add_option( '--replace-in-metadata', - dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, - help='Replace text in a metadata field using the given regex. This option can be used multiple times') + dest='parse_metadata', metavar='[WHEN:]FIELDS REGEX REPLACE', nargs=3, **when_prefix('pre_process'), + help=( + 'Replace text in a metadata field using the given regex. This option can be used multiple times. ' + 'Supported values of "WHEN" are the same as that of --use-postprocessor (default: pre_process)')) postproc.add_option( '--xattrs', '--xattr', action='store_true', dest='xattrs', default=False, From d5f043d127cac1e8ec8a6eacde04ad1133600a16 Mon Sep 17 00:00:00 2001 From: ChillingPepper <90042155+ChillingPepper@users.noreply.github.com> Date: Fri, 30 Dec 2022 07:38:38 +0100 Subject: [PATCH 080/153] [utils] js_to_json: Fix bug in f55523c (#5771) Authored by: ChillingPepper, pukkandan --- test/test_utils.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++ yt_dlp/utils.py | 8 ++++- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 49ab3796b9..82ae77ea25 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -954,6 +954,85 @@ def test_escape_url(self): ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') + def test_js_to_json_vars_strings(self): + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'null': a, + 'nullStr': b, + 'true': c, + 
'trueStr': d, + 'false': e, + 'falseStr': f, + 'unresolvedVar': g, + }''', + { + 'a': 'null', + 'b': '"null"', + 'c': 'true', + 'd': '"true"', + 'e': 'false', + 'f': '"false"', + 'g': 'var', + } + )), + { + 'null': None, + 'nullStr': 'null', + 'true': True, + 'trueStr': 'true', + 'false': False, + 'falseStr': 'false', + 'unresolvedVar': 'var' + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'int': a, + 'intStr': b, + 'float': c, + 'floatStr': d, + }''', + { + 'a': '123', + 'b': '"123"', + 'c': '1.23', + 'd': '"1.23"', + } + )), + { + 'int': 123, + 'intStr': '123', + 'float': 1.23, + 'floatStr': '1.23', + } + ) + + self.assertDictEqual( + json.loads(js_to_json( + '''{ + 'object': a, + 'objectStr': b, + 'array': c, + 'arrayStr': d, + }''', + { + 'a': '{}', + 'b': '"{}"', + 'c': '[]', + 'd': '"[]"', + } + )), + { + 'object': {}, + 'objectStr': '{}', + 'array': [], + 'arrayStr': '[]', + } + ) + def test_js_to_json_realworld(self): inp = '''{ 'clip':{'provider':'pseudo'} diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 43b5fda1d2..64c83a77a2 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3360,7 +3360,13 @@ def fix_kv(m): return f'"{i}":' if v.endswith(':') else str(i) if v in vars: - return json.dumps(vars[v]) + try: + if not strict: + json.loads(vars[v]) + except json.decoder.JSONDecodeError: + return json.dumps(vars[v]) + else: + return vars[v] if not strict: return f'"{v}"' From f74371a97d67237e055612006602934b910b1275 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 11:57:33 +0530 Subject: [PATCH 081/153] [extractor/bilibili] Fix `--no-playlist` for anthology Closes #5797 --- yt_dlp/extractor/bilibili.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 37711c138a..92620f697b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -303,7 +303,8 @@ def _real_extract(self, url): getter=lambda entry: 
f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: - title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' + part_id = part_id or 1 + title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') From ec54bd43f374cee429d67078ac61b75e66afb3fa Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 14:07:11 +0530 Subject: [PATCH 082/153] Fix bug in writing playlist info-json Closes #4889 --- yt_dlp/YoutubeDL.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 5057323274..db6bfded83 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1862,11 +1862,10 @@ def __process_playlist(self, ie_result, download): self.to_screen('[download] Downloading item %s of %s' % ( self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) - extra.update({ + entry_result = self.__process_iterable_entry(entry, download, collections.ChainMap({ 'playlist_index': playlist_index, 'playlist_autonumber': i + 1, - }) - entry_result = self.__process_iterable_entry(entry, download, extra) + }, extra)) if not entry_result: failures += 1 if failures >= max_failures: From fbb73833067ba742459729809679a62f34b3e41e Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 15:30:56 +0530 Subject: [PATCH 083/153] Add `weba` to known extensions --- test/test_utils.py | 2 ++ yt_dlp/utils.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 82ae77ea25..3d5a6ea6ba 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1953,6 +1953,8 @@ def test_get_compatible_ext(self): vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['m4a']), 'mkv') self.assertEqual(get_compatible_ext( vcodecs=[None], acodecs=[None], 
vexts=['webm'], aexts=['webm']), 'webm') + self.assertEqual(get_compatible_ext( + vcodecs=[None], acodecs=[None], vexts=['webm'], aexts=['weba']), 'webm') self.assertEqual(get_compatible_ext( vcodecs=['h264'], acodecs=['mp4a'], vexts=['mov'], aexts=['m4a']), 'mp4') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 64c83a77a2..ee5340cd26 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3656,7 +3656,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): COMPATIBLE_EXTS = ( {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'}, - {'webm'}, + {'webm', 'weba'}, ) for ext in preferences or vexts: current_exts = {ext, *vexts, *aexts} @@ -5962,7 +5962,7 @@ def items_(self): common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), - audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'), + audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), thumbnails=('jpg', 'png', 'webp'), storyboards=('mhtml', ), subtitles=('srt', 'vtt', 'ass', 'lrc'), @@ -6094,9 +6094,9 @@ class FormatSorter: 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')}, + 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext', + 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'), + 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')}, 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, 'aud_or_vid': {'visible': False, 'forced': 
True, 'type': 'multiple', 'field': ('vcodec', 'acodec'), From 9bb856998b0d5a0ad58268f0ba8d784fb9d934e3 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 30 Dec 2022 15:32:33 +0530 Subject: [PATCH 084/153] [extractor/youtube] Extract DRC formats --- yt_dlp/extractor/youtube.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 9dde34fb01..506bd1e19a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2544,6 +2544,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': [], }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, + }, { + 'note': 'Audio formats with Dynamic Range Compression', + 'url': 'https://www.youtube.com/watch?v=Tq92D6wQ1mg', + 'info_dict': { + 'id': 'Tq92D6wQ1mg', + 'ext': 'weba', + 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', + 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_follower_count': int, + 'description': 'md5:17eccca93a786d51bc67646756894066', + 'upload_date': '20191228', + 'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'], + 'playable_in_embed': True, + 'like_count': int, + 'categories': ['Entertainment'], + 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', + 'age_limit': 18, + 'channel': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'view_count': int, + 'availability': 'needs_auth', + 'comment_count': int, + 'live_status': 'not_live', + 'uploader': 'Projekt Melody', + 'duration': 106, + }, + 'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'}, } ] @@ -3553,7 +3582,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l itag = str_or_none(fmt.get('itag')) audio_track = 
fmt.get('audioTrack') or {} - stream_id = '%s.%s' % (itag or '', audio_track.get('id', '')) + stream_id = (itag, audio_track.get('id'), fmt.get('isDrc')) if stream_id in stream_ids: continue @@ -3634,11 +3663,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), - 'format_id': itag, + 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + 'DRC' if fmt.get('isDrc') else None, try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), @@ -3647,7 +3677,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l 'fps': int_or_none(fmt.get('fps')) or None, 'audio_channels': fmt.get('audioChannels'), 'height': height, - 'quality': q(quality), + 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, 'url': fmt_url, From 8d1ddb0805c7c56bd03a5c0837c55602473d213f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 31 Dec 2022 09:45:12 +0530 Subject: [PATCH 085/153] [extractor/udemy] Fix lectures that have no URL and detect DRM Closes #5662 --- yt_dlp/extractor/udemy.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py index 8b99c59cf5..329e5da2d9 100644 --- a/yt_dlp/extractor/udemy.py +++ b/yt_dlp/extractor/udemy.py @@ -11,8 +11,10 @@ int_or_none, js_to_json, sanitized_Request, + smuggle_url, try_get, unescapeHTML, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -106,7 +108,7 @@ def 
_download_lecture(self, course_id, lecture_id): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data,course_is_drmed', }) def _handle_error(self, response): @@ -199,16 +201,19 @@ def is_logged(webpage): def _real_extract(self, url): lecture_id = self._match_id(url) + course_id = unsmuggle_url(url, {})[1].get('course_id') - webpage = self._download_webpage(url, lecture_id) - - course_id, _ = self._extract_course_info(webpage, lecture_id) + webpage = None + if not course_id: + webpage = self._download_webpage(url, lecture_id) + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + webpage = webpage or self._download_webpage(url, lecture_id) self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: @@ -391,6 +396,9 @@ def extract_subtitles(track_list): if f.get('url'): formats.append(f) + if not formats and asset.get('course_is_drmed'): + self.report_drm(video_id) + return { 'id': video_id, 'title': title, @@ -449,7 +457,9 @@ def _real_extract(self, url): if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'url': smuggle_url( + f'https://www.udemy.com/{course_path}/learn/v4/t/lecture/{entry["id"]}', + {'course_id': course_id}), 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } From a0e526ed4d042c88771cd5669ceb4413d2b8c47f Mon Sep 17 00:00:00 2001 From: Stel Abrego Date: Fri, 30 Dec 2022 20:58:33 -0800 Subject: [PATCH 086/153] 
[extractor/bandcamp] Add `album_artist` (#5537) Closes #5536 Authored by: stelcodes --- yt_dlp/extractor/bandcamp.py | 48 +++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index de81e0de7b..e89b3a69b3 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -29,11 +29,18 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", + 'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭', 'duration': 9.8485, - 'uploader': 'youtube-dl "\'/\\ä↭', + 'uploader': 'youtube-dl "\'/\\ä↭', 'upload_date': '20121129', 'timestamp': 1354224127, + 'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭', + 'album_artist': 'youtube-dl "\'/\\ä↭', + 'track_id': '1812978515', + 'artist': 'youtube-dl "\'/\\ä↭', + 'uploader_url': 'https://youtube-dl.bandcamp.com', + 'uploader_id': 'youtube-dl', + 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { @@ -41,7 +48,8 @@ class BandcampIE(InfoExtractor): 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'info_dict': { 'id': '2650410135', - 'ext': 'aiff', + 'ext': 'm4a', + 'acodec': r're:[fa]lac', 'title': 'Ben Prunty - Lanius (Battle)', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ben Prunty', @@ -54,7 +62,10 @@ class BandcampIE(InfoExtractor): 'track_number': 1, 'track_id': '2650410135', 'artist': 'Ben Prunty', + 'album_artist': 'Ben Prunty', 'album': 'FTL: Advanced Edition Soundtrack', + 'uploader_url': 'https://benprunty.bandcamp.com', + 'uploader_id': 'benprunty', }, }, { # no free download, mp3 128 @@ -75,7 +86,34 @@ class BandcampIE(InfoExtractor): 'track_number': 5, 'track_id': '2584466013', 'artist': 'Mastodon', + 'album_artist': 'Mastodon', 'album': 
'Call of the Mastodon', + 'uploader_url': 'https://relapsealumni.bandcamp.com', + 'uploader_id': 'relapsealumni', + }, + }, { + # track from compilation album (artist/album_artist difference) + 'url': 'https://diskotopia.bandcamp.com/track/safehouse', + 'md5': '19c5337bca1428afa54129f86a2f6a69', + 'info_dict': { + 'id': '1978174799', + 'ext': 'mp3', + 'title': 'submerse - submerse - Safehouse', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'submerse', + 'timestamp': 1480779297, + 'upload_date': '20161203', + 'release_timestamp': 1481068800, + 'release_date': '20161207', + 'duration': 154.066, + 'track': 'submerse - Safehouse', + 'track_number': 3, + 'track_id': '1978174799', + 'artist': 'submerse', + 'album_artist': 'Diskotopia', + 'album': 'DSK F/W 2016-2017 Free Compilation', + 'uploader_url': 'https://diskotopia.bandcamp.com', + 'uploader_id': 'diskotopia', }, }] @@ -121,6 +159,9 @@ def _real_extract(self, url): embed = self._extract_data_attr(webpage, title, 'embed', False) current = tralbum.get('current') or {} artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') + album_artist = self._html_search_regex( + r'

[\S\s]*?by\s*\s*\s*([^>]+?)\s*', + webpage, 'album artist', fatal=False) timestamp = unified_timestamp( current.get('publish_date') or tralbum.get('album_publish_date')) @@ -205,6 +246,7 @@ def _real_extract(self, url): 'track_id': track_id, 'artist': artist, 'album': embed.get('album_title'), + 'album_artist': album_artist, 'formats': formats, } From 2fb0f858686c46abc50a0e253245afe750746775 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 31 Dec 2022 11:02:24 +0530 Subject: [PATCH 087/153] [update] Workaround #5632 --- yt_dlp/update.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index ac3e28057d..a3a731aef5 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -15,7 +15,6 @@ Popen, cached_method, deprecation_warning, - remove_end, shell_quote, system_identifier, traverse_obj, @@ -43,7 +42,8 @@ def _get_variant_and_executable_path(): # Ref: https://en.wikipedia.org/wiki/Uname#Examples if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): machine = '_x86' if platform.architecture()[0][:2] == '32' else '' - return f'{remove_end(sys.platform, "32")}{machine}_exe', path + # NB: https://github.com/yt-dlp/yt-dlp/issues/5632 + return f'{sys.platform}{machine}_exe', path path = os.path.dirname(__file__) if isinstance(__loader__, zipimporter): @@ -74,8 +74,8 @@ def current_git_head(): _FILE_SUFFIXES = { 'zip': '', 'py2exe': '_min.exe', - 'win_exe': '.exe', - 'win_x86_exe': '_x86.exe', + 'win32_exe': '.exe', + 'win32_x86_exe': '_x86.exe', 'darwin_exe': '_macos', 'darwin_legacy_exe': '_macos_legacy', 'linux_exe': '_linux', From 8e40b9d1ec132ae1bcac50b3ee520ece46ac9c55 Mon Sep 17 00:00:00 2001 From: Matthew Date: Sun, 1 Jan 2023 04:29:22 +0000 Subject: [PATCH 088/153] Improve plugin architecture (#5553) to make plugins easier to develop and use: * Plugins are now loaded as namespace packages. * Plugins can be loaded in any distribution of yt-dlp (binary, pip, source, etc.). 
* Plugin packages can be installed and managed via pip, or dropped into any of the documented locations. * Users do not need to edit any code files to install plugins. * Backwards-compatible with previous plugin architecture. As a side-effect, yt-dlp will now search in a few more locations for config files. Closes https://github.com/yt-dlp/yt-dlp/issues/1389 Authored by: flashdagger, coletdjnz, pukkandan, Grub4K Co-authored-by: Marcel Co-authored-by: pukkandan Co-authored-by: Simon Sawicki --- .gitignore | 8 +- README.md | 66 ++++++- devscripts/make_lazy_extractors.py | 4 + test/test_plugins.py | 73 ++++++++ .../yt_dlp_plugins/extractor/_ignore.py | 5 + .../yt_dlp_plugins/extractor/ignore.py | 12 ++ .../yt_dlp_plugins/extractor/normal.py | 9 + .../yt_dlp_plugins/postprocessor/normal.py | 5 + .../yt_dlp_plugins/extractor/zipped.py | 5 + .../yt_dlp_plugins/postprocessor/zipped.py | 5 + yt_dlp/YoutubeDL.py | 15 +- yt_dlp/extractor/extractors.py | 4 +- yt_dlp/options.py | 91 +++++----- yt_dlp/plugins.py | 171 ++++++++++++++++++ yt_dlp/postprocessor/__init__.py | 5 +- yt_dlp/utils.py | 55 ++++-- ytdlp_plugins/extractor/__init__.py | 4 - ytdlp_plugins/extractor/sample.py | 14 -- ytdlp_plugins/postprocessor/__init__.py | 4 - ytdlp_plugins/postprocessor/sample.py | 26 --- 20 files changed, 455 insertions(+), 126 deletions(-) create mode 100644 test/test_plugins.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/_ignore.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/ignore.py create mode 100644 test/testdata/yt_dlp_plugins/extractor/normal.py create mode 100644 test/testdata/yt_dlp_plugins/postprocessor/normal.py create mode 100644 test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py create mode 100644 test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py create mode 100644 yt_dlp/plugins.py delete mode 100644 ytdlp_plugins/extractor/__init__.py delete mode 100644 ytdlp_plugins/extractor/sample.py delete mode 100644 
ytdlp_plugins/postprocessor/__init__.py delete mode 100644 ytdlp_plugins/postprocessor/sample.py diff --git a/.gitignore b/.gitignore index 00d74057fa..ef4d116167 100644 --- a/.gitignore +++ b/.gitignore @@ -120,9 +120,5 @@ yt-dlp.zip */extractor/lazy_extractors.py # Plugins -ytdlp_plugins/extractor/* -!ytdlp_plugins/extractor/__init__.py -!ytdlp_plugins/extractor/sample.py -ytdlp_plugins/postprocessor/* -!ytdlp_plugins/postprocessor/__init__.py -!ytdlp_plugins/postprocessor/sample.py +ytdlp_plugins/* +yt-dlp-plugins/* diff --git a/README.md b/README.md index 500f92387b..4294090dc5 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ * [Modifying metadata examples](#modifying-metadata-examples) * [EXTRACTOR ARGUMENTS](#extractor-arguments) * [PLUGINS](#plugins) + * [Installing Plugins](#installing-plugins) + * [Developing Plugins](#developing-plugins) * [EMBEDDING YT-DLP](#embedding-yt-dlp) * [Embedding examples](#embedding-examples) * [DEPRECATED OPTIONS](#deprecated-options) @@ -1110,15 +1112,20 @@ # CONFIGURATION * If `-P` is not given, the current directory is searched 1. **User Configuration**: * `${XDG_CONFIG_HOME}/yt-dlp/config` (recommended on Linux/macOS) + * `${XDG_CONFIG_HOME}/yt-dlp/config.txt` * `${XDG_CONFIG_HOME}/yt-dlp.conf` * `${APPDATA}/yt-dlp/config` (recommended on Windows) * `${APPDATA}/yt-dlp/config.txt` * `~/yt-dlp.conf` * `~/yt-dlp.conf.txt` + * `~/.yt-dlp/config` + * `~/.yt-dlp/config.txt` See also: [Notes about environment variables](#notes-about-environment-variables) 1. **System Configuration**: * `/etc/yt-dlp.conf` + * `/etc/yt-dlp/config` + * `/etc/yt-dlp/config.txt` E.g. 
with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: ``` @@ -1789,19 +1796,68 @@ #### twitter # PLUGINS -Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`; where `<root-dir>` is the directory of the binary (`<root-dir>/yt-dlp`), or the root directory of the module if you are running directly from source-code (`<root-dir>/yt_dlp/__main__.py`). Plugins are currently not supported for the `pip` version +Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. **Use plugins at your own risk and only if you trust the code!** -Plugins can be of `<type>`s `extractor` or `postprocessor`. Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. Postprocessor plugins can be invoked using `--use-postprocessor NAME`. +Plugins can be of `<type>`s `extractor` or `postprocessor`. +- Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. +- Extractor plugins take priority over builtin extractors. +- Postprocessor plugins can be invoked using `--use-postprocessor NAME`. -See [ytdlp_plugins](ytdlp_plugins) for example plugins. -Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. Use plugins at your own risk and only if you trust the code +Plugins are loaded from the namespace packages `yt_dlp_plugins.extractor` and `yt_dlp_plugins.postprocessor`. 
-If you are a plugin author, add [ytdlp-plugins](https://github.com/topics/ytdlp-plugins) as a topic to your repository for discoverability +In other words, the file structure on the disk looks something like: + + yt_dlp_plugins/ + extractor/ + myplugin.py + postprocessor/ + myplugin.py + +yt-dlp looks for these `yt_dlp_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them. See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) +## Installing Plugins +Plugins can be installed using various methods and locations. + +1. **Configuration directories**: + Plugin packages (containing a `yt_dlp_plugins` namespace folder) can be dropped into the following standard [configuration locations](#configuration): + * **User Plugins** + * `${XDG_CONFIG_HOME}/yt-dlp/plugins/<package name>/yt_dlp_plugins/` (recommended on Linux/macOS) + * `${XDG_CONFIG_HOME}/yt-dlp-plugins/<package name>/yt_dlp_plugins/` + * `${APPDATA}/yt-dlp/plugins/<package name>/yt_dlp_plugins/` (recommended on Windows) + * `~/.yt-dlp/plugins/<package name>/yt_dlp_plugins/` + * `~/yt-dlp-plugins/<package name>/yt_dlp_plugins/` + * **System Plugins** + * `/etc/yt-dlp/plugins/<package name>/yt_dlp_plugins/` + * `/etc/yt-dlp-plugins/<package name>/yt_dlp_plugins/` +2. **Executable location**: Plugin packages can similarly be installed in a `yt-dlp-plugins` directory under the executable location: + * Binary: where `<root-dir>/yt-dlp.exe`, `<root-dir>/yt-dlp-plugins/<package name>/yt_dlp_plugins/` + * Source: where `<root-dir>/yt_dlp/__main__.py`, `<root-dir>/yt-dlp-plugins/<package name>/yt_dlp_plugins/` + +3. **pip and other locations in `PYTHONPATH`** + * Plugin packages can be installed and managed using `pip`. See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. + * Note: plugin files between plugin packages installed with pip must have unique filenames + * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. + * Note: This does not apply for Pyinstaller/py2exe builds.
+ + +.zip, .egg and .whl archives containing a `yt_dlp_plugins` namespace folder in their root are also supported. These can be placed in the same locations `yt_dlp_plugins` namespace folders can be found. +- e.g. `${XDG_CONFIG_HOME}/yt-dlp/plugins/mypluginpkg.zip` where `mypluginpkg.zip` contains `yt_dlp_plugins/<type>/myplugin.py` + +Run yt-dlp with `--verbose`/`-v` to check if the plugin has been loaded. + +## Developing Plugins + +See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. + +All public classes with a name ending in `IE` are imported from each file. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) + +If you are a plugin author, add [yt-dlp-plugins](https://github.com/topics/yt-dlp-plugins) as a topic to your repository for discoverability + +See the [Developer Instructions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) on how to write and test an extractor. 
# EMBEDDING YT-DLP diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index c502bdf896..d74ea202f0 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -40,8 +40,12 @@ def main(): _ALL_CLASSES = get_all_ies() # Must be before import + import yt_dlp.plugins from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + # Filter out plugins + _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{yt_dlp.plugins.PACKAGE_NAME}.')] + DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( MODULE_TEMPLATE, diff --git a/test/test_plugins.py b/test/test_plugins.py new file mode 100644 index 0000000000..6cde579e1e --- /dev/null +++ b/test/test_plugins.py @@ -0,0 +1,73 @@ +import importlib +import os +import shutil +import sys +import unittest +from pathlib import Path + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +TEST_DATA_DIR = Path(os.path.dirname(os.path.abspath(__file__)), 'testdata') +sys.path.append(str(TEST_DATA_DIR)) +importlib.invalidate_caches() + +from yt_dlp.plugins import PACKAGE_NAME, directories, load_plugins + + +class TestPlugins(unittest.TestCase): + + TEST_PLUGIN_DIR = TEST_DATA_DIR / PACKAGE_NAME + + def test_directories_containing_plugins(self): + self.assertIn(self.TEST_PLUGIN_DIR, map(Path, directories())) + + def test_extractor_classes(self): + for module_name in tuple(sys.modules): + if module_name.startswith(f'{PACKAGE_NAME}.extractor'): + del sys.modules[module_name] + plugins_ie = load_plugins('extractor', 'IE') + + self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertIn('NormalPluginIE', plugins_ie.keys()) + + # don't load modules with underscore prefix + self.assertFalse( + f'{PACKAGE_NAME}.extractor._ignore' in sys.modules.keys(), + 'loaded module beginning with underscore') + self.assertNotIn('IgnorePluginIE', 
plugins_ie.keys()) + + # Don't load extractors with underscore prefix + self.assertNotIn('_IgnoreUnderscorePluginIE', plugins_ie.keys()) + + # Don't load extractors not specified in __all__ (if supplied) + self.assertNotIn('IgnoreNotInAllPluginIE', plugins_ie.keys()) + self.assertIn('InAllPluginIE', plugins_ie.keys()) + + def test_postprocessor_classes(self): + plugins_pp = load_plugins('postprocessor', 'PP') + self.assertIn('NormalPluginPP', plugins_pp.keys()) + + def test_importing_zipped_module(self): + zip_path = TEST_DATA_DIR / 'zipped_plugins.zip' + shutil.make_archive(str(zip_path)[:-4], 'zip', str(zip_path)[:-4]) + sys.path.append(str(zip_path)) # add zip to search paths + importlib.invalidate_caches() # reset the import caches + + try: + for plugin_type in ('extractor', 'postprocessor'): + package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') + self.assertIn(zip_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) + + plugins_ie = load_plugins('extractor', 'IE') + self.assertIn('ZippedPluginIE', plugins_ie.keys()) + + plugins_pp = load_plugins('postprocessor', 'PP') + self.assertIn('ZippedPluginPP', plugins_pp.keys()) + + finally: + sys.path.remove(str(zip_path)) + os.remove(zip_path) + importlib.invalidate_caches() # reset the import caches + + +if __name__ == '__main__': + unittest.main() diff --git a/test/testdata/yt_dlp_plugins/extractor/_ignore.py b/test/testdata/yt_dlp_plugins/extractor/_ignore.py new file mode 100644 index 0000000000..57faf75bbc --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/_ignore.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class IgnorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/yt_dlp_plugins/extractor/ignore.py b/test/testdata/yt_dlp_plugins/extractor/ignore.py new file mode 100644 index 0000000000..816a16aa20 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/ignore.py @@ -0,0 +1,12 @@ +from yt_dlp.extractor.common import InfoExtractor + + 
+class IgnoreNotInAllPluginIE(InfoExtractor): + pass + + +class InAllPluginIE(InfoExtractor): + pass + + +__all__ = ['InAllPluginIE'] diff --git a/test/testdata/yt_dlp_plugins/extractor/normal.py b/test/testdata/yt_dlp_plugins/extractor/normal.py new file mode 100644 index 0000000000..b09009bdc6 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/normal.py @@ -0,0 +1,9 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class NormalPluginIE(InfoExtractor): + pass + + +class _IgnoreUnderscorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/yt_dlp_plugins/postprocessor/normal.py new file mode 100644 index 0000000000..315b85a488 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/postprocessor/normal.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class NormalPluginPP(PostProcessor): + pass diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py new file mode 100644 index 0000000000..01542e0d8d --- /dev/null +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class ZippedPluginIE(InfoExtractor): + pass diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py new file mode 100644 index 0000000000..223822bd6f --- /dev/null +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/postprocessor/zipped.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class ZippedPluginPP(PostProcessor): + pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index db6bfded83..9ef56a46b6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -32,6 +32,7 @@ from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper from .minicurses import 
format_text +from .plugins import directories as plugin_directories from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors from .postprocessor import ( EmbedThumbnailPP, @@ -3773,10 +3774,6 @@ def get_encoding(stream): write_debug('Lazy loading extractors is forcibly disabled') else: write_debug('Lazy loading extractors is disabled') - if plugin_extractors or plugin_postprocessors: - write_debug('Plugins: %s' % [ - '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params['compat_opts']: write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) @@ -3810,6 +3807,16 @@ def get_encoding(stream): proxy_map.update(handler.proxies) write_debug(f'Proxy map: {proxy_map}') + for plugin_type, plugins in {'Extractor': plugin_extractors, 'Post-Processor': plugin_postprocessors}.items(): + if not plugins: + continue + write_debug(f'{plugin_type} Plugins: %s' % (', '.join(sorted(('%s%s' % ( + klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in plugins.items()))))) + plugin_dirs = plugin_directories() + if plugin_dirs: + write_debug(f'Plugin directories: {plugin_dirs}') + # Not implemented if False and self.params.get('call_home'): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 610e02f906..beda02917e 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1,10 +1,10 @@ import contextlib import os -from ..utils import load_plugins +from ..plugins import load_plugins # NB: Must be before other imports so that plugins can be correctly injected -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) +_PLUGIN_CLASSES = load_plugins('extractor', 'IE') _LAZY_LOADER = False if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): diff --git a/yt_dlp/options.py 
b/yt_dlp/options.py index ed83cb763e..be4695cbb5 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -29,6 +29,8 @@ expand_path, format_field, get_executable_path, + get_system_config_dirs, + get_user_config_dirs, join_nonempty, orderedSet_from_options, remove_end, @@ -42,62 +44,67 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): if ignore_config_files == 'if_override': ignore_config_files = overrideArguments is not None - def _readUserConf(package_name, default=[]): - # .config + def _load_from_config_dirs(config_dirs): + for config_dir in config_dirs: + conf_file_path = os.path.join(config_dir, 'config') + conf = Config.read_file(conf_file_path, default=None) + if conf is None: + conf_file_path += '.txt' + conf = Config.read_file(conf_file_path, default=None) + if conf is not None: + return conf, conf_file_path + return None, None + + def _read_user_conf(package_name, default=None): + # .config/package_name.conf xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') - userConfFile = os.path.join(xdg_config_home, package_name, 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name) - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + user_conf_file = os.path.join(xdg_config_home, '%s.conf' % package_name) + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is not None: + return user_conf, user_conf_file - # appdata - appdata_dir = os.getenv('appdata') - if appdata_dir: - userConfFile = os.path.join(appdata_dir, package_name, 'config') - userConf = Config.read_file(userConfFile, default=None) - if userConf is None: - userConfFile += '.txt' - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + # home (~/package_name.conf or ~/package_name.conf.txt) + user_conf_file = 
os.path.join(compat_expanduser('~'), '%s.conf' % package_name) + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is None: + user_conf_file += '.txt' + user_conf = Config.read_file(user_conf_file, default=None) + if user_conf is not None: + return user_conf, user_conf_file - # home - userConfFile = os.path.join(compat_expanduser('~'), '%s.conf' % package_name) - userConf = Config.read_file(userConfFile, default=None) - if userConf is None: - userConfFile += '.txt' - userConf = Config.read_file(userConfFile, default=None) - if userConf is not None: - return userConf, userConfFile + # Package config directories (e.g. ~/.config/package_name/package_name.txt) + user_conf, user_conf_file = _load_from_config_dirs(get_user_config_dirs(package_name)) + if user_conf is not None: + return user_conf, user_conf_file + return default if default is not None else [], None - return default, None + def _read_system_conf(package_name, default=None): + system_conf, system_conf_file = _load_from_config_dirs(get_system_config_dirs(package_name)) + if system_conf is not None: + return system_conf, system_conf_file + return default if default is not None else [], None - def add_config(label, path, user=False): + def add_config(label, path=None, func=None): """ Adds config and returns whether to continue """ if root.parse_known_args()[0].ignoreconfig: return False - # Multiple package names can be given here - # E.g. 
('yt-dlp', 'youtube-dlc', 'youtube-dl') will look for - # the configuration file of any of these three packages - for package in ('yt-dlp',): - if user: - args, current_path = _readUserConf(package, default=None) - else: - current_path = os.path.join(path, '%s.conf' % package) - args = Config.read_file(current_path, default=None) - if args is not None: - root.append_config(args, current_path, label=label) - return True + elif func: + assert path is None + args, current_path = func('yt-dlp') + else: + current_path = os.path.join(path, 'yt-dlp.conf') + args = Config.read_file(current_path, default=None) + if args is not None: + root.append_config(args, current_path, label=label) + return True return True def load_configs(): yield not ignore_config_files yield add_config('Portable', get_executable_path()) yield add_config('Home', expand_path(root.parse_known_args()[0].paths.get('home', '')).strip()) - yield add_config('User', None, user=True) - yield add_config('System', '/etc') + yield add_config('User', func=_read_user_conf) + yield add_config('System', func=_read_system_conf) opts = optparse.Values({'verbose': True, 'print_help': False}) try: diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py new file mode 100644 index 0000000000..7d2226d0f1 --- /dev/null +++ b/yt_dlp/plugins.py @@ -0,0 +1,171 @@ +import contextlib +import importlib +import importlib.abc +import importlib.machinery +import importlib.util +import inspect +import itertools +import os +import pkgutil +import sys +import traceback +import zipimport +from pathlib import Path +from zipfile import ZipFile + +from .compat import functools # isort: split +from .compat import compat_expanduser +from .utils import ( + get_executable_path, + get_system_config_dirs, + get_user_config_dirs, + write_string, +) + +PACKAGE_NAME = 'yt_dlp_plugins' +COMPAT_PACKAGE_NAME = 'ytdlp_plugins' + + +class PluginLoader(importlib.abc.Loader): + """Dummy loader for virtual namespace packages""" + + def exec_module(self, 
module): + return None + + +@functools.cache +def dirs_in_zip(archive): + with ZipFile(archive) as zip: + return set(itertools.chain.from_iterable( + Path(file).parents for file in zip.namelist())) + + +class PluginFinder(importlib.abc.MetaPathFinder): + """ + This class provides one or multiple namespace packages. + It searches in sys.path and yt-dlp config folders for + the existing subdirectories from which the modules can be imported + """ + + def __init__(self, *packages): + self._zip_content_cache = {} + self.packages = set(itertools.chain.from_iterable( + itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) + for name in packages)) + + def search_locations(self, fullname): + candidate_locations = [] + + def _get_package_paths(*root_paths, containing_folder='plugins'): + for config_dir in map(Path, root_paths): + plugin_dir = config_dir / containing_folder + if not plugin_dir.is_dir(): + continue + yield from plugin_dir.iterdir() + + # Load from yt-dlp config folders + candidate_locations.extend(_get_package_paths( + *get_user_config_dirs('yt-dlp'), *get_system_config_dirs('yt-dlp'), + containing_folder='plugins')) + + # Load from yt-dlp-plugins folders + candidate_locations.extend(_get_package_paths( + get_executable_path(), + compat_expanduser('~'), + '/etc', + os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), + containing_folder='yt-dlp-plugins')) + + candidate_locations.extend(map(Path, sys.path)) # PYTHONPATH + + parts = Path(*fullname.split('.')) + locations = set() + for path in dict.fromkeys(candidate_locations): + candidate = path / parts + if candidate.is_dir(): + locations.add(str(candidate)) + elif path.name and any(path.with_suffix(suffix).is_file() for suffix in {'.zip', '.egg', '.whl'}): + with contextlib.suppress(FileNotFoundError): + if parts in dirs_in_zip(path): + locations.add(str(candidate)) + return locations + + def find_spec(self, fullname, path=None, target=None): + if fullname not in self.packages: + 
return None + + search_locations = self.search_locations(fullname) + if not search_locations: + return None + + spec = importlib.machinery.ModuleSpec(fullname, PluginLoader(), is_package=True) + spec.submodule_search_locations = search_locations + return spec + + def invalidate_caches(self): + dirs_in_zip.cache_clear() + for package in self.packages: + if package in sys.modules: + del sys.modules[package] + + +def directories(): + spec = importlib.util.find_spec(PACKAGE_NAME) + return spec.submodule_search_locations if spec else [] + + +def iter_modules(subpackage): + fullname = f'{PACKAGE_NAME}.{subpackage}' + with contextlib.suppress(ModuleNotFoundError): + pkg = importlib.import_module(fullname) + yield from pkgutil.iter_modules(path=pkg.__path__, prefix=f'{fullname}.') + + +def load_module(module, module_name, suffix): + return inspect.getmembers(module, lambda obj: ( + inspect.isclass(obj) + and obj.__name__.endswith(suffix) + and obj.__module__.startswith(module_name) + and not obj.__name__.startswith('_') + and obj.__name__ in getattr(module, '__all__', [obj.__name__]))) + + +def load_plugins(name, suffix): + classes = {} + + for finder, module_name, _ in iter_modules(name): + if any(x.startswith('_') for x in module_name.split('.')): + continue + try: + if sys.version_info < (3, 10) and isinstance(finder, zipimport.zipimporter): + # zipimporter.load_module() is deprecated in 3.10 and removed in 3.12 + # The exec_module branch below is the replacement for >= 3.10 + # See: https://docs.python.org/3/library/zipimport.html#zipimport.zipimporter.exec_module + module = finder.load_module(module_name) + else: + spec = finder.find_spec(module_name) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + except Exception: + write_string(f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}') + continue + classes.update(load_module(module, module_name, suffix)) + + # Compat: 
old plugin system using __init__.py + # Note: plugins imported this way do not show up in directories() + # nor are considered part of the yt_dlp_plugins namespace package + with contextlib.suppress(FileNotFoundError): + spec = importlib.util.spec_from_file_location( + name, Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py')) + plugins = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = plugins + spec.loader.exec_module(plugins) + classes.update(load_module(plugins, spec.name, suffix)) + + return classes + + +sys.meta_path.insert(0, PluginFinder(f'{PACKAGE_NAME}.extractor', f'{PACKAGE_NAME}.postprocessor')) + +__all__ = ['directories', 'load_plugins', 'PACKAGE_NAME', 'COMPAT_PACKAGE_NAME'] diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index f168be46ad..bfe9df733b 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -33,14 +33,15 @@ from .sponskrub import SponSkrubPP from .sponsorblock import SponsorBlockPP from .xattrpp import XAttrMetadataPP -from ..utils import load_plugins +from ..plugins import load_plugins -_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP', globals()) +_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP') def get_postprocessor(key): return globals()[key + 'PP'] +globals().update(_PLUGIN_CLASSES) __all__ = [name for name in globals().keys() if name.endswith('PP')] __all__.extend(('PostProcessor', 'FFmpegPostProcessor')) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index ee5340cd26..32da598d0f 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -18,7 +18,6 @@ import html.parser import http.client import http.cookiejar -import importlib.util import inspect import io import itertools @@ -5372,22 +5371,37 @@ def get_executable_path(): return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1])) -def load_plugins(name, suffix, namespace): - classes = {} - with contextlib.suppress(FileNotFoundError): - plugins_spec = 
importlib.util.spec_from_file_location( - name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')) - plugins = importlib.util.module_from_spec(plugins_spec) - sys.modules[plugins_spec.name] = plugins - plugins_spec.loader.exec_module(plugins) - for name in dir(plugins): - if name in namespace: - continue - if not name.endswith(suffix): - continue - klass = getattr(plugins, name) - classes[name] = namespace[name] = klass - return classes +def get_user_config_dirs(package_name): + locations = set() + + # .config (e.g. ~/.config/package_name) + xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + config_dir = os.path.join(xdg_config_home, package_name) + if os.path.isdir(config_dir): + locations.add(config_dir) + + # appdata (%APPDATA%/package_name) + appdata_dir = os.getenv('appdata') + if appdata_dir: + config_dir = os.path.join(appdata_dir, package_name) + if os.path.isdir(config_dir): + locations.add(config_dir) + + # home (~/.package_name) + user_config_directory = os.path.join(compat_expanduser('~'), '.%s' % package_name) + if os.path.isdir(user_config_directory): + locations.add(user_config_directory) + + return locations + + +def get_system_config_dirs(package_name): + locations = set() + # /etc/package_name + system_config_directory = os.path.join('/etc', package_name) + if os.path.isdir(system_config_directory): + locations.add(system_config_directory) + return locations def traverse_obj( @@ -6367,3 +6381,10 @@ def calculate_preference(self, format): # Deprecated has_certifi = bool(certifi) has_websockets = bool(websockets) + + +def load_plugins(name, suffix, namespace): + from .plugins import load_plugins + ret = load_plugins(name, suffix) + namespace.update(ret) + return ret diff --git a/ytdlp_plugins/extractor/__init__.py b/ytdlp_plugins/extractor/__init__.py deleted file mode 100644 index 3045a590bd..0000000000 --- a/ytdlp_plugins/extractor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa: 
F401 - -# ℹ️ The imported name must end in "IE" -from .sample import SamplePluginIE diff --git a/ytdlp_plugins/extractor/sample.py b/ytdlp_plugins/extractor/sample.py deleted file mode 100644 index a8bc455eb3..0000000000 --- a/ytdlp_plugins/extractor/sample.py +++ /dev/null @@ -1,14 +0,0 @@ -# ⚠ Don't use relative imports -from yt_dlp.extractor.common import InfoExtractor - - -# ℹ️ Instructions on making extractors can be found at: -# 🔗 https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#adding-support-for-a-new-site - -class SamplePluginIE(InfoExtractor): - _WORKING = False - IE_DESC = False - _VALID_URL = r'^sampleplugin:' - - def _real_extract(self, url): - self.to_screen('URL "%s" successfully captured' % url) diff --git a/ytdlp_plugins/postprocessor/__init__.py b/ytdlp_plugins/postprocessor/__init__.py deleted file mode 100644 index 61099abbc6..0000000000 --- a/ytdlp_plugins/postprocessor/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa: F401 - -# ℹ️ The imported name must end in "PP" and is the name to be used in --use-postprocessor -from .sample import SamplePluginPP diff --git a/ytdlp_plugins/postprocessor/sample.py b/ytdlp_plugins/postprocessor/sample.py deleted file mode 100644 index 4563e1c116..0000000000 --- a/ytdlp_plugins/postprocessor/sample.py +++ /dev/null @@ -1,26 +0,0 @@ -# ⚠ Don't use relative imports -from yt_dlp.postprocessor.common import PostProcessor - - -# ℹ️ See the docstring of yt_dlp.postprocessor.common.PostProcessor -class SamplePluginPP(PostProcessor): - def __init__(self, downloader=None, **kwargs): - # ⚠ Only kwargs can be passed from the CLI, and all argument values will be string - # Also, "downloader", "when" and "key" are reserved names - super().__init__(downloader) - self._kwargs = kwargs - - # ℹ️ See docstring of yt_dlp.postprocessor.common.PostProcessor.run - def run(self, info): - if info.get('_type', 'video') != 'video': # PP was called for playlist - self.to_screen(f'Post-processing playlist 
{info.get("id")!r} with {self._kwargs}') - elif info.get('filepath'): # PP was called after download (default) - filepath = info.get('filepath') - self.to_screen(f'Post-processed {filepath!r} with {self._kwargs}') - elif info.get('requested_downloads'): # PP was called after_video - filepaths = [f.get('filepath') for f in info.get('requested_downloads')] - self.to_screen(f'Post-processed {filepaths!r} with {self._kwargs}') - else: # PP was called before actual download - filepath = info.get('_filename') - self.to_screen(f'Pre-processed {filepath!r} with {self._kwargs}') - return [], info # return list_of_files_to_delete, info_dict From 3e01ce744a981d8f19ae77ec695005e7000f4703 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 1 Jan 2023 18:40:26 +1300 Subject: [PATCH 089/153] [extractor/generic] Use `Accept-Encoding: identity` for initial request The existing comment seems to imply this was the desired behavior from the beginning. Partial fix for https://github.com/yt-dlp/yt-dlp/issues/5855, https://github.com/yt-dlp/yt-dlp/issues/5851, https://github.com/yt-dlp/yt-dlp/issues/4748 --- yt_dlp/extractor/generic.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 2281c71f3d..ffc2790230 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2154,6 +2154,21 @@ class GenericIE(InfoExtractor): 'age_limit': 0, 'direct': True, } + }, { + 'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.', + 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'info_dict': { + 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'ext': 'mp4', + 'title': 'čauky lidi 70 finall', + 'description': 'čauky lidi 70 finall', + 'thumbnail': 'h', + 'upload_date': '20220606', + 'timestamp': 1654513791, + 'duration': 318.0, + 
'direct': True, + 'age_limit': 0, + } } ] @@ -2312,7 +2327,7 @@ def _real_extract(self, url): # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. full_response = self._request_webpage(url, video_id, headers={ - 'Accept-Encoding': '*', + 'Accept-Encoding': 'identity', **smuggled_data.get('http_headers', {}) }) new_url = full_response.geturl() From 1cdda3299810b86206853a22e680758eadcc4e05 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:11:14 +0530 Subject: [PATCH 090/153] [utils] `get_exe_version`: Detect broken executables Authored by: dirkf, pukkandan Closes #5561 --- yt_dlp/utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 32da598d0f..5af176b364 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2720,8 +2720,10 @@ def _get_exe_version_output(exe, args): # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if yt-dlp is run in the background. 
# See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if ret: + return None except OSError: return False return stdout @@ -2739,11 +2741,15 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized='present'): + version_re=None, unrecognized=('present', 'broken')): """ Returns the version of the specified executable, or False if the executable is not present """ + unrecognized = variadic(unrecognized) + assert len(unrecognized) in (1, 2) out = _get_exe_version_output(exe, args) - return detect_exe_version(out, version_re, unrecognized) if out else False + if out is None: + return unrecognized[-1] + return out and detect_exe_version(out, version_re, unrecognized[0]) def frange(start=0, stop=None, step=1): From 88fb9425775da7f92d24e8b5f3009cafb56e94d6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 13:32:05 +0530 Subject: [PATCH 091/153] Add message when there are no subtitles/thumbnails Closes #5551 --- yt_dlp/YoutubeDL.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9ef56a46b6..866d069b76 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3930,7 +3930,7 @@ def _write_description(self, label, ie_result, descfn): elif not self.params.get('overwrites', True) and os.path.exists(descfn): self.to_screen(f'[info] {label.title()} description is already present') elif ie_result.get('description') is None: - self.report_warning(f'There\'s no {label} description to write') + self.to_screen(f'[info] There\'s no {label} description to write') return False else: try: @@ 
-3946,15 +3946,18 @@ def _write_subtitles(self, info_dict, filename): ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error''' ret = [] subtitles = info_dict.get('requested_subtitles') - if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): + if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE return ret - + elif not subtitles: + self.to_screen('[info] There\'s no subtitles for the requested languages') + return ret sub_filename_base = self.prepare_filename(info_dict, 'subtitle') if not sub_filename_base: self.to_screen('[info] Skipping writing video subtitles') return ret + for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -4001,6 +4004,9 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] + if not thumbnails: + self.to_screen(f'[info] There\'s no {label} thumbnails to download') + return ret multiple = write_all and len(thumbnails) > 1 if thumb_filename_base is None: From 2a06bb4eb671eb306a2687ef0a4f853b936f05e0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 13:42:43 +0530 Subject: [PATCH 092/153] Add `--compat-options 2021,2022` Use these to guard against future compat changes. This allows devs to change defaults and make other potentially breaking changes more easily. 
If you need everything to work exactly as-is, put this in your config --- README.md | 2 ++ yt_dlp/options.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 4294090dc5..f6bf1175e2 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,8 @@ ### Differences in default behavior * `--compat-options all`: Use all compat options (Do NOT use) * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` +* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` +* `--compat-options 2022`: Currently does nothing. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/options.py b/yt_dlp/options.py index be4695cbb5..e9766c02d7 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -470,6 +470,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): }, 'aliases': { 'youtube-dl': ['all', '-multistreams'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], + '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], + '2022': [], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' From 78d25e0b7c2b45597e193c0decb33f4f248502a9 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:10:51 +0530 Subject: [PATCH 093/153] [extractor/embedly] Handle vimeo embeds Closes #3360 --- yt_dlp/extractor/embedly.py | 62 +++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index 483d018bb4..db5ef055ec 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -1,13 +1,63 @@ import re import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from .youtube import 
YoutubeTabIE +from ..utils import parse_qs, smuggle_url, traverse_obj class EmbedlyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P[^#&]+)' + _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)' _TESTS = [{ 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'info_dict': { + 'id': 'UUGLim4T2loE5rwCMdpCIPVg', + 'modified_date': '20221225', + 'view_count': int, + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'uploader': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel': 'TraciJHines', + 'availability': 'public', + 'uploader_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'description': '', + 'tags': [], + 'title': 'Uploads from TraciJHines', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'params': {'noplaylist': True}, + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'age_limit': 0, + 'categories': ['Entertainment'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/SU4fj_aEMVw/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'TraciJHines', + 'uploader_id': 'TraciJHines', + 'channel_url': 
'https://www.youtube.com/channel/UCGLim4T2loE5rwCMdpCIPVg', + 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'upload_date': '20150211', + 'duration': 282, + 'availability': 'public', + 'channel_follower_count': int, + 'tags': 'count:39', + 'view_count': int, + 'comment_count': int, + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'like_count': int, + 'uploader': 'TraciJHines', + 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', + 'chapters': list, + + }, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=https://player.vimeo.com/video/1234567?h=abcdefgh', 'only_matching': True, }] @@ -21,4 +71,10 @@ def _extract_embed_urls(cls, url, webpage): yield urllib.parse.unquote(mobj.group('url')) def _real_extract(self, url): - return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) + qs = parse_qs(url) + src = urllib.parse.unquote(traverse_obj(qs, ('url', 0)) or '') + if src and YoutubeTabIE.suitable(src): + return self.url_result(src, YoutubeTabIE) + return self.url_result(smuggle_url( + urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))), + {'http_headers': {'Referer': url}})) From 26fdfc3704a278acada27cc420d67c6d3f71423b Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 14:39:58 +0530 Subject: [PATCH 094/153] [extractor/biliintl:series] Make partial download of series faster --- yt_dlp/extractor/bilibili.py | 51 +++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 92620f697b..3274a427da 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -20,9 +20,11 @@ parse_count, parse_qs, qualities, + smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -881,16 +883,12 @@ def _get_formats(self, *, ep_id=None, aid=None): return formats - def _extract_video_info(self, video_data, *, ep_id=None, aid=None): + def 
_parse_video_metadata(self, video_data): return { - 'id': ep_id or aid, 'title': video_data.get('title_display') or video_data.get('title'), 'thumbnail': video_data.get('cover'), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), - 'formats': self._get_formats(ep_id=ep_id, aid=aid), - 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), - 'extractor_key': BiliIntlIE.ie_key(), } def _perform_login(self, username, password): @@ -975,9 +973,16 @@ class BiliIntlIE(BiliIntlBaseIE): 'only_matching': True, }] - def _real_extract(self, url): - season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') - video_id = ep_id or aid + def _make_url(video_id, series_id=None): + if series_id: + return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}' + return f'https://www.bilibili.tv/en/video/{video_id}' + + def _extract_video_metadata(self, url, video_id, season_id): + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('title'): + return smuggled_data + webpage = self._download_webpage(url, video_id) # Bstation layout initial_data = ( @@ -989,13 +994,26 @@ def _real_extract(self, url): if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = traverse_obj(season_json, - ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), - expected_type=dict, get_all=False) - return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) + video_data = traverse_obj(season_json, ( + 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id + ), expected_type=dict, get_all=False) + + return self._parse_video_metadata(video_data) + + def _real_extract(self, url): + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = 
ep_id or aid + + return { + 'id': video_id, + **self._extract_video_metadata(url, video_id, season_id), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), + } class BiliIntlSeriesIE(BiliIntlBaseIE): + IE_NAME = 'biliintl:series' _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', @@ -1021,9 +1039,12 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): def _entries(self, series_id): series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) - for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): - episode_id = str(episode.get('episode_id')) - yield self._extract_video_info(episode, ep_id=episode_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict): + episode_id = str(episode['episode_id']) + yield self.url_result(smuggle_url( + BiliIntlIE._make_url(episode_id, series_id), + self._parse_video_metadata(episode) + ), BiliIntlIE, episode_id) def _real_extract(self, url): series_id = self._match_id(url) From 193fb150b76c4aaf41fb2c98b073e7e1f8a108f0 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 1 Jan 2023 17:01:48 +0530 Subject: [PATCH 095/153] Fix bug in 119e40ef64b25f66a39246e87ce6c143cd34276d --- yt_dlp/YoutubeDL.py | 3 ++- yt_dlp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 866d069b76..8ce71a2dc6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3460,7 +3460,8 @@ def run_pp(self, pp, infodict): return infodict def run_all_pps(self, key, info, *, additional_pps=None): - self._forceprint(key, info) + if key != 'video': + self._forceprint(key, info) for pp in (additional_pps or []) + self._pps[key]: info = self.run_pp(pp, info) return info diff --git 
a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3490816c4c..9cb1324105 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -703,7 +703,7 @@ def parse_options(argv=None): postprocessors = list(get_postprocessors(opts)) - print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[2:]) + print_only = bool(opts.forceprint) and all(k not in opts.forceprint for k in POSTPROCESS_WHEN[3:]) any_getting = any(getattr(opts, k) for k in ( 'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename', 'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl' From 8c53322cda75394a8d551dde20b2529ee5ad6e89 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Mon, 2 Jan 2023 02:16:25 +0900 Subject: [PATCH 096/153] [downloader/aria2c] Native progress for aria2c via RPC (#3724) Authored by: Lesmiscore, pukkandan Closes #2038 --- README.md | 3 +- yt_dlp/downloader/external.py | 109 ++++++++++++++++++++++++++++++++-- yt_dlp/options.py | 6 +- yt_dlp/utils.py | 9 +++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f6bf1175e2..83e69a236b 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,7 @@ ### Differences in default behavior * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. 
You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior +* yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: `aria2c`). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is For ease of use, a few more compat options are available: @@ -160,7 +161,7 @@ ### Differences in default behavior * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Currently does nothing. Use this to enable all future compat options +* `--compat-options 2022`: Same as `--compat-options no-external-downloader-progress`. Use this to enable all future compat options # INSTALLATION diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 5751383712..569839f6f4 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -1,9 +1,11 @@ import enum +import json import os.path import re import subprocess import sys import time +import uuid from .fragment import FragmentFD from ..compat import functools @@ -20,8 +22,10 @@ determine_ext, encodeArgument, encodeFilename, + find_available_port, handle_youtubedl_headers, remove_end, + sanitized_Request, traverse_obj, ) @@ -60,7 +64,6 @@ def real_download(self, filename, info_dict): } if filename != '-': fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(f'\r[{self.get_basename()}] Downloaded {fsize} bytes') self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -129,8 +132,7 @@ def _call_downloader(self, tmpfilename, info_dict): self._debug_cmd(cmd) if 'fragments' not in info_dict: - _, stderr, returncode = Popen.run( - cmd, 
text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + _, stderr, returncode = self._call_process(cmd, info_dict) if returncode and stderr: self.to_stderr(stderr) return returncode @@ -140,7 +142,7 @@ def _call_downloader(self, tmpfilename, info_dict): retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, frag_index=None, fatal=not skip_unavailable_fragments) for retry in retry_manager: - _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE) + _, stderr, returncode = self._call_process(cmd, info_dict) if not returncode: break # TODO: Decide whether to retry based on error code @@ -172,6 +174,9 @@ def _call_downloader(self, tmpfilename, info_dict): self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename)) return 0 + def _call_process(self, cmd, info_dict): + return Popen.run(cmd, text=True, stderr=subprocess.PIPE) + class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -256,6 +261,14 @@ def supports_manifest(manifest): def _aria2c_filename(fn): return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}' + def _call_downloader(self, tmpfilename, info_dict): + if 'no-external-downloader-progress' not in self.params.get('compat_opts', []): + info_dict['__rpc'] = { + 'port': find_available_port() or 19190, + 'secret': str(uuid.uuid4()), + } + return super()._call_downloader(tmpfilename, info_dict) + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', @@ -276,6 +289,12 @@ def _make_cmd(self, tmpfilename, info_dict): cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') cmd += self._configuration_args() + if '__rpc' in info_dict: + cmd += [ + '--enable-rpc', + f'--rpc-listen-port={info_dict["__rpc"]["port"]}', + f'--rpc-secret={info_dict["__rpc"]["secret"]}'] + # aria2c strips out spaces from the beginning/end of filenames and paths. 
# We work around this issue by adding a "./" to the beginning of the # filename and relative path, and adding a "/" at the end of the path. @@ -304,6 +323,88 @@ def _make_cmd(self, tmpfilename, info_dict): cmd += ['--', info_dict['url']] return cmd + def aria2c_rpc(self, rpc_port, rpc_secret, method, params=()): + # Does not actually need to be UUID, just unique + sanitycheck = str(uuid.uuid4()) + d = json.dumps({ + 'jsonrpc': '2.0', + 'id': sanitycheck, + 'method': method, + 'params': [f'token:{rpc_secret}', *params], + }).encode('utf-8') + request = sanitized_Request( + f'http://localhost:{rpc_port}/jsonrpc', + data=d, headers={ + 'Content-Type': 'application/json', + 'Content-Length': f'{len(d)}', + 'Ytdl-request-proxy': '__noproxy__', + }) + with self.ydl.urlopen(request) as r: + resp = json.load(r) + assert resp.get('id') == sanitycheck, 'Something went wrong with RPC server' + return resp['result'] + + def _call_process(self, cmd, info_dict): + if '__rpc' not in info_dict: + return super()._call_process(cmd, info_dict) + + send_rpc = functools.partial(self.aria2c_rpc, info_dict['__rpc']['port'], info_dict['__rpc']['secret']) + started = time.time() + + fragmented = 'fragments' in info_dict + frag_count = len(info_dict['fragments']) if fragmented else 1 + status = { + 'filename': info_dict.get('_filename'), + 'status': 'downloading', + 'elapsed': 0, + 'downloaded_bytes': 0, + 'fragment_count': frag_count if fragmented else None, + 'fragment_index': 0 if fragmented else None, + } + self._hook_progress(status, info_dict) + + def get_stat(key, *obj, average=False): + val = tuple(filter(None, map(float, traverse_obj(obj, (..., ..., key))))) or [0] + return sum(val) / (len(val) if average else 1) + + with Popen(cmd, text=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) as p: + # Add a small sleep so that RPC client can receive response, + # or the connection stalls infinitely + time.sleep(0.2) + retval = p.poll() + while retval is None: + # We don't use 
tellStatus as we won't know the GID without reading stdout + # Ref: https://aria2.github.io/manual/en/html/aria2c.html#aria2.tellActive + active = send_rpc('aria2.tellActive') + completed = send_rpc('aria2.tellStopped', [0, frag_count]) + + downloaded = get_stat('totalLength', completed) + get_stat('completedLength', active) + speed = get_stat('downloadSpeed', active) + total = frag_count * get_stat('totalLength', active, completed, average=True) + if total < downloaded: + total = None + + status.update({ + 'downloaded_bytes': int(downloaded), + 'speed': speed, + 'total_bytes': None if fragmented else total, + 'total_bytes_estimate': total, + 'eta': (total - downloaded) / (speed or 1), + 'fragment_index': min(frag_count, len(completed) + 1) if fragmented else None, + 'elapsed': time.time() - started + }) + self._hook_progress(status, info_dict) + + if not active and len(completed) >= frag_count: + send_rpc('aria2.shutdown') + retval = p.wait() + break + + time.sleep(0.1) + retval = p.poll() + + return '', p.stderr.read(), retval + class HttpieFD(ExternalFD): AVAILABLE_OPT = '--version' diff --git a/yt_dlp/options.py b/yt_dlp/options.py index e9766c02d7..5bbb292dee 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -464,14 +464,14 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'allowed_values': { 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', - 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', - 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', + 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', + 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', }, 
'aliases': { 'youtube-dl': ['all', '-multistreams'], 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': [], + '2022': ['no-external-downloader-progress'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 5af176b364..45a7e6eaa5 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5243,6 +5243,15 @@ def random_birthday(year_field, month_field, day_field): } +def find_available_port(interface=''): + try: + with socket.socket() as sock: + sock.bind((interface, 0)) + return sock.getsockname()[1] + except OSError: + return None + + # Templates for internet shortcut files, which are plain text files. DOT_URL_LINK_TEMPLATE = '''\ [InternetShortcut] From e756f45ba0648f972be71ce328419a623e381028 Mon Sep 17 00:00:00 2001 From: Matthew Date: Mon, 2 Jan 2023 04:55:11 +0000 Subject: [PATCH 097/153] Improve handling for overriding extractors with plugins (#5916) * Extractors replaced with plugin extractors now show in debug output * Better testcase handling * Added documentation Authored by: coletdjnz, pukkandan --- README.md | 9 ++++++--- yt_dlp/YoutubeDL.py | 22 +++++++++++++++------- yt_dlp/extractor/common.py | 13 +++++++++++-- yt_dlp/extractor/extractors.py | 2 ++ yt_dlp/extractor/testurl.py | 11 ++++++----- 5 files changed, 40 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 83e69a236b..c4bd6ef0c7 100644 --- a/README.md +++ b/README.md @@ -1841,7 +1841,7 @@ ## Installing Plugins * Source: where `/yt_dlp/__main__.py`, `/yt-dlp-plugins//yt_dlp_plugins/` 3. **pip and other locations in `PYTHONPATH`** - * Plugin packages can be installed and managed using `pip`. See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. + * Plugin packages can be installed and managed using `pip`. 
See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. * Note: plugin files between plugin packages installed with pip must have unique filenames * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. * Note: This does not apply for Pyinstaller/py2exe builds. @@ -1854,9 +1854,12 @@ ## Installing Plugins ## Developing Plugins -See [ytdlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. +See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for a sample plugin package with instructions on how to set up an environment for plugin development. -All public classes with a name ending in `IE` are imported from each file. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) +All public classes with a name ending in `IE`/`PP` are imported from each file for extractors and postprocessors repectively. This respects underscore prefix (e.g. `_MyBasePluginIE` is private) and `__all__`. Modules can similarly be excluded by prefixing the module name with an underscore (e.g. `_myplugin.py`) + +To replace an existing extractor with a subclass of one, set the `plugin_name` class keyword argument (e.g. `MyPluginIE(ABuiltInIE, plugin_name='myplugin')` will replace `ABuiltInIE` with `MyPluginIE`). +Due to the mechanics behind this, you should exclude the subclass extractor from being imported separately by making it private using one of the methods described above. 
If you are a plugin author, add [yt-dlp-plugins](https://github.com/topics/yt-dlp-plugins) as a topic to your repository for discoverability diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8ce71a2dc6..e7b4690590 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -33,7 +33,7 @@ from .extractor.openload import PhantomJSwrapper from .minicurses import format_text from .plugins import directories as plugin_directories -from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors +from .postprocessor import _PLUGIN_CLASSES as plugin_pps from .postprocessor import ( EmbedThumbnailPP, FFmpegFixupDuplicateMoovPP, @@ -3730,7 +3730,10 @@ def print_debug_header(self): # These imports can be slow. So import them only as needed from .extractor.extractors import _LAZY_LOADER - from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors + from .extractor.extractors import ( + _PLUGIN_CLASSES as plugin_ies, + _PLUGIN_OVERRIDES as plugin_ie_overrides + ) def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) @@ -3808,12 +3811,17 @@ def get_encoding(stream): proxy_map.update(handler.proxies) write_debug(f'Proxy map: {proxy_map}') - for plugin_type, plugins in {'Extractor': plugin_extractors, 'Post-Processor': plugin_postprocessors}.items(): - if not plugins: - continue - write_debug(f'{plugin_type} Plugins: %s' % (', '.join(sorted(('%s%s' % ( + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): + display_list = ['%s%s' % ( klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in plugins.items()))))) + for name, klass in plugins.items()] + if plugin_type == 'Extractor': + display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' + for parent, plugins in plugin_ie_overrides.items()) + if not display_list: + continue + write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}') + 
plugin_dirs = plugin_directories() if plugin_dirs: write_debug(f'Plugin directories: {plugin_dirs}') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 9031f3c116..f48b97a6b6 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3442,13 +3442,17 @@ def get_testcases(cls, include_onlymatching=False): continue t['name'] = cls.ie_key() yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_testcases(include_onlymatching) @classmethod def get_webpage_testcases(cls): tests = vars(cls).get('_WEBPAGE_TESTS', []) for t in tests: t['name'] = cls.ie_key() - return tests + yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_webpage_testcases() @classproperty(cache=True) def age_limit(cls): @@ -3710,10 +3714,12 @@ def __init_subclass__(cls, *, plugin_name=None, **kwargs): if plugin_name: mro = inspect.getmro(cls) super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] - cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key + cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}' while getattr(super_class, '__wrapped__', None): super_class = super_class.__wrapped__ setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + _PLUGIN_OVERRIDES[super_class].append(cls) return super().__init_subclass__(**kwargs) @@ -3770,3 +3776,6 @@ class UnsupportedURLIE(InfoExtractor): def _real_extract(self, url): raise UnsupportedError(url) + + +_PLUGIN_OVERRIDES = collections.defaultdict(list) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index beda02917e..baa69d2421 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -24,3 +24,5 @@ globals().update(_PLUGIN_CLASSES) _ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() + +from .common import _PLUGIN_OVERRIDES # noqa: F401 diff --git a/yt_dlp/extractor/testurl.py 
b/yt_dlp/extractor/testurl.py index dccca10046..0da01aa53e 100644 --- a/yt_dlp/extractor/testurl.py +++ b/yt_dlp/extractor/testurl.py @@ -23,11 +23,12 @@ def _real_extract(self, url): if len(matching_extractors) == 0: raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True) elif len(matching_extractors) > 1: - try: # Check for exact match - extractor = next( - ie for ie in matching_extractors - if ie.IE_NAME.lower() == extractor_id.lower()) - except StopIteration: + extractor = next(( # Check for exact match + ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower() + ), None) or next(( # Check for exact match without plugin suffix + ie for ie in matching_extractors if ie.IE_NAME.split('+')[0].lower() == extractor_id.lower() + ), None) + if not extractor: raise ExtractorError( 'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors), expected=True) From b23b503e22ff577d23920e877ee73da478bb4c6f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:44:54 +0000 Subject: [PATCH 098/153] [extractor/odnoklassniki] Extract subtitles (#5920) Closes #5744 Authored by: bashonly --- yt_dlp/extractor/odnoklassniki.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 4f325f0878..4b73eed37e 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -11,6 +11,7 @@ int_or_none, qualities, smuggle_url, + traverse_obj, unescapeHTML, unified_strdate, unsmuggle_url, @@ -153,6 +154,26 @@ class OdnoklassnikiIE(InfoExtractor): 'title': 'Быковское крещение', 'duration': 3038.181, }, + 'skip': 'HTTP Error 400', + }, { + 'note': 'subtitles', + 'url': 'https://ok.ru/video/4249587550747', + 'info_dict': { + 'id': '4249587550747', + 'ext': 'mp4', + 'title': 'Small Country An African Childhood (2020) (1080p) 
+subtitle', + 'uploader': 'Sunflower Movies', + 'uploader_id': '595802161179', + 'upload_date': '20220816', + 'duration': 6728, + 'age_limit': 0, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', + 'like_count': int, + 'subtitles': dict, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -202,6 +223,7 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': 0, 'duration': 10444, }, + 'skip': 'Site no longer embeds', }] @classmethod @@ -294,6 +316,16 @@ def _extract_desktop(self, url): like_count = int_or_none(metadata.get('likeCount')) + subtitles = {} + for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('language') or 'en', []).append({ + 'url': sub_url, + 'ext': 'vtt', + }) + info = { 'id': video_id, 'title': title, @@ -305,6 +337,7 @@ def _extract_desktop(self, url): 'like_count': like_count, 'age_limit': age_limit, 'start_time': start_time, + 'subtitles': subtitles, } # pladform From 13f930abc0c91d8e50336488e4c55defe97aa588 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:46:06 +0000 Subject: [PATCH 099/153] [extractor/fifa] Fix Preplay extraction (#5921) Closes #5839 Authored by: dirkf --- yt_dlp/extractor/fifa.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index dc00edcb31..8b4db3a8ae 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -17,8 +17,10 @@ class FifaIE(InfoExtractor): 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', 'ext': 'mp4', 'categories': ['FIFA Tournaments'], - 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'thumbnail': 
'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero', 'duration': 8165, + 'release_timestamp': 1152403200, + 'release_date': '20060709', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -54,7 +56,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) preconnect_link = self._search_regex( - r']+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + r']+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') video_details = self._download_json( f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) @@ -62,22 +64,9 @@ def _real_extract(self, url): preplay_parameters = self._download_json( f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] - cid = preplay_parameters['contentId'] content_data = self._download_json( - f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ - 'v': preplay_parameters['preplayAPIVersion'], - 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], - 'rn': preplay_parameters['randomNumber'], - 'exp': preplay_parameters['tokenExpirationDate'], - 'ct': preplay_parameters['contentType'], - 'cid': cid, - 'mbtracks': preplay_parameters['tracksAssetNumber'], - 'ad': preplay_parameters['adConfiguration'], - 'ad.preroll': int(preplay_parameters['adPreroll']), - 'ad.cmsid': preplay_parameters['adCMSSourceId'], - 'ad.vid': preplay_parameters['adSourceVideoID'], - 'sig': preplay_parameters['signature'], - }) + 'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters), + video_id, 'Downloading Content Data') formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) From d7f98714696a4c9691ed28fb9b63395b9227646a Mon Sep 17 00:00:00 2001 From: bashonly 
<88596187+bashonly@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:50:37 +0000 Subject: [PATCH 100/153] [extractor/iqiyi] Fix `Iq` JS regex (#5922) Closes #5702 Authored by: bashonly --- yt_dlp/extractor/iqiyi.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index dbc688fb92..eba89f787e 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -527,11 +527,14 @@ def _extract_vms_player_js(self, webpage, video_id): webpack_js_url = self._proto_relative_url(self._search_regex( r').*?\.setup\s*\((?P[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -3237,19 +3243,20 @@ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - entries = [] + if not isinstance(jwplayer_data, dict): + return entries - # JWPlayer backward compatibility: single playlist item + playlist_items = jwplayer_data.get('playlist') + # JWPlayer backward compatibility: single playlist item/flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if not isinstance(playlist_items, list): + playlist_items = (playlist_items or jwplayer_data, ) - for video_data in jwplayer_data['playlist']: + for video_data in playlist_items: + if not isinstance(video_data, dict): + continue # 
JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: @@ -3287,6 +3294,13 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... + 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -3304,7 +3318,7 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -3313,14 +3327,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', 
m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -3335,13 +3349,12 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), @@ -3349,6 +3362,7 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'filesize': int_or_none(source.get('filesize')), 'ext': ext, + 'format_id': format_id } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ffc2790230..14d492f075 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -32,6 +32,7 @@ unified_timestamp, unsmuggle_url, url_or_none, + urljoin, variadic, xpath_attr, xpath_text, @@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'description': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Untested major version'], }, { # KVS Player 'url': 'https://www.kvs-demo.com/embed/105/', @@ 
-1880,35 +1883,12 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July / Embed Player', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, }, { - # KVS Player - 'url': 'https://thisvid.com/videos/french-boy-pantsed/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player - 'url': 'https://thisvid.com/embed/2400174/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player 'url': 'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { @@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://youix.com/embed/18485', @@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Ленинград - ЗОЖ', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': 
r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', 'md5': '94166bdb26b4cb1fb9214319a629fc51', 'info_dict': { 'id': '21217', - 'display_id': '40-nochey-40-nights-2016', + 'display_id': '40-nochey-2016', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', - } + }, }, { # KVS Player (for sites that serve kt_player.js via non-https urls) @@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor): 'id': '389508', 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', 'ext': 'mp4', - 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', - 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', - } + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, }, { # Reddit-hosted video that will redirect and be processed by RedditIE @@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor): 'direct': True, 'age_limit': 0, } - } + }, + { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg', + 'height': 720, + 'age_limit': 18, + }, + }, ] def report_following_redirect(self, new_url): @@ -2235,43 +2229,87 @@ def itunes(key): 'entries': entries, } - def 
_kvs_getrealurl(self, video_url, license_code): + @classmethod + def _kvs_get_real_url(cls, video_url, license_code): if not video_url.startswith('function/0/'): return video_url # not obfuscated - url_path, _, url_query = video_url.partition('?') - urlparts = url_path.split('/')[2:] - license = self._kvs_getlicensetoken(license_code) - newmagic = urlparts[5][:32] + parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) + license = cls._kvs_get_license_token(license_code) + urlparts = parsed.path.split('/') - for o in range(len(newmagic) - 1, -1, -1): - new = '' - l = (o + sum(int(n) for n in license[o:])) % 32 + HASH_LENGTH = 32 + hash = urlparts[3][:HASH_LENGTH] + indices = list(range(HASH_LENGTH)) - for i in range(0, len(newmagic)): - if i == o: - new += newmagic[l] - elif i == l: - new += newmagic[o] - else: - new += newmagic[i] - newmagic = new + # Swap indices of hash according to the destination calculated from the license token + accum = 0 + for src in reversed(range(HASH_LENGTH)): + accum += license[src] + dest = (src + accum) % HASH_LENGTH + indices[src], indices[dest] = indices[dest], indices[src] - urlparts[5] = newmagic + urlparts[5][32:] - return '/'.join(urlparts) + '?' 
+ url_query + urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:] + return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) - def _kvs_getlicensetoken(self, license): - modlicense = license.replace('$', '').replace('0', '1') - center = int(len(modlicense) / 2) + @staticmethod + def _kvs_get_license_token(license): + license = license.replace('$', '') + license_values = [int(char) for char in license] + + modlicense = license.replace('0', '1') + center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) + modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1] - modlicense = str(4 * abs(fronthalf - backhalf)) - retval = '' - for o in range(0, center + 1): - for i in range(1, 5): - retval += str((int(license[o + i]) + int(modlicense[o])) % 10) - return retval + return [ + (license_values[index + offset] + current) % 10 + for index, current in enumerate(map(int, modlicense)) + for offset in range(4) + ] + + def _extract_kvs(self, url, webpage, video_id): + flashvars = self._search_json( + r'(?s:]*>.*?var\s+flashvars\s*=)', + webpage, 'flashvars', video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. 
+ display_id = self._search_regex( + r'(?:' + r'|)', + webpage, 'display_id', fatal=False) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(f'{key}_text', key) + formats.append({ + 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])), + 'format_id': format_id, + 'ext': 'mp4', + **(parse_resolution(format_id) or parse_resolution(flashvars[key])), + 'http_headers': {'Referer': url}, + }) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } def _real_extract(self, url): if url.startswith('//'): @@ -2580,6 +2618,17 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): self.report_detected('video.js embed') return [{'formats': formats, 'subtitles': subtitles}] + # Look for generic KVS player (before json-ld bc of some urls that break otherwise) + found = self._search_regex(( + r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) + if found: + self.report_detected('KWS Player') + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning(f'Untested major version ({found}) in player engine - download may fail.') + return [self._extract_kvs(url, webpage, video_id)] + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, 
default={}) if json_ld.get('url') not in (url, None): @@ -2622,52 +2671,6 @@ def filter_video(urls): ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) if found: self.report_detected('JW Player embed') - if not found: - # Look for generic KVS player - found = re.search(r'', webpage) - flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) - - # extract the part after the last / as the display_id from the - # canonical URL. - display_id = self._search_regex( - r'(?:' - r'|)', - webpage, 'display_id', fatal=False - ) - title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') - - thumbnail = flashvars['preview_url'] - if thumbnail.startswith('//'): - protocol, _, _ = url.partition('/') - thumbnail = protocol + thumbnail - - url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys())) - formats = [] - for key in url_keys: - if '/get_file/' not in flashvars[key]: - continue - format_id = flashvars.get(f'{key}_text', key) - formats.append({ - 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), - 'format_id': format_id, - 'ext': 'mp4', - **(parse_resolution(format_id) or parse_resolution(flashvars[key])) - }) - if not formats[-1].get('height'): - formats[-1]['quality'] = 1 - - return [{ - 'id': flashvars['video_id'], - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - }] if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index 2d9b9a7425..d1fc058b92 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -1,71 +1,128 @@ +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) -class PeekVidsIE(InfoExtractor): +class 
PeekVidsBaseIE(InfoExtractor): + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).group('domain', 'id') + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}', + video_id=video_id, expected=True) + + title = self._html_search_regex(r'(?s)]*>(.+?)

', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + f'https://www.{domain}/v-alt/{video_id}', video_id, + note='Downloading list of source files') + + formats = [] + for k, v in srcs.items(): + f_url = url_or_none(v) + if not f_url: + continue + + height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None) + if not height: + continue + + formats.append({ + 'url': f_url, + 'format_id': height, + 'height': int_or_none(height), + }) + + if not formats: + formats = [{'url': url} for url in srcs.values()] + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) + + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = (get_element_by_class('detail-video-block', webpage) + or get_element_by_class('detail-block', webpage) or '') + info['description'] = self._html_search_regex( + rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|]*>\s*{re.escape(name)}\s*:\s*(.+?)', + html, name, default='') + return list(filter(None, re.split(r'\s+', l))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PeekVidsIE(PeekVidsBaseIE): _VALID_URL = r'''(?x) - https?://(?:www\.)?peekvids\.com/ + https?://(?:www\.)?(?Ppeekvids\.com)/ (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) (?P[^/?&#]*) ''' _TESTS = [{ 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', - 'md5': 'a00940646c428e232407e3e62f0e8ef5', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', 'info_dict': { - 'id': 
'BSyLMbN0YCd', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1642579329, 'upload_date': '20220119', 'duration': 416, 'view_count': int, 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, }, }] - _DOMAIN = 'www.peekvids.com' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - short_video_id = self._html_search_regex(r'