]*>\s*([^<]+)',
webpage, 'description', default=None),
- 'thumbnail': self._html_search_regex(
- r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None),
+ 'thumbnail': url_or_none(player_data.get('poster')),
'uploader': self._html_search_regex(
- r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
+ r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
'release_year': int_or_none(self._html_search_regex(
- r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
+ r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)',
webpage, 'release_year', default=None)),
} if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index 9381c7eab..6c441ff34 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': r're:^https?://.+',
},
'params': {'skip_download': 'm3u8'},
+ }, {
+ # multiple attachments/embeds
+ 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
+ 'playlist_count': 3,
+ 'info_dict': {
+ 'id': '100601977',
+ 'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
+ 'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
+ 'uploader': 'Bradley Hall',
+ 'uploader_id': '24401883',
+ 'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
+ 'channel_id': '3193932',
+ 'channel_url': 'https://www.patreon.com/bradleyhallguitar',
+ 'channel_follower_count': int,
+ 'timestamp': 1710777855,
+ 'upload_date': '20240318',
+ 'like_count': int,
+ 'comment_count': int,
+ 'thumbnail': r're:^https?://.+',
+ },
+ 'skip': 'Patron-only content',
}]
+ _RETURN_TYPE = 'video'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -234,58 +256,54 @@ def _real_extract(self, url):
'include': 'audio,user,user_defined_tags,campaign,attachments_media',
})
attributes = post['data']['attributes']
- title = attributes['title'].strip()
- image = attributes.get('image') or {}
- info = {
- 'id': video_id,
- 'title': title,
- 'description': clean_html(attributes.get('content')),
- 'thumbnail': image.get('large_url') or image.get('url'),
- 'timestamp': parse_iso8601(attributes.get('published_at')),
- 'like_count': int_or_none(attributes.get('like_count')),
- 'comment_count': int_or_none(attributes.get('comment_count')),
- }
- can_view_post = traverse_obj(attributes, 'current_user_can_view')
- if can_view_post and info['comment_count']:
- info['__post_extractor'] = self.extract_comments(video_id)
+ info = traverse_obj(attributes, {
+ 'title': ('title', {str.strip}),
+ 'description': ('content', {clean_html}),
+ 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
+ 'timestamp': ('published_at', {parse_iso8601}),
+ 'like_count': ('like_count', {int_or_none}),
+ 'comment_count': ('comment_count', {int_or_none}),
+ })
- for i in post.get('included', []):
- i_type = i.get('type')
- if i_type == 'media':
- media_attributes = i.get('attributes') or {}
- download_url = media_attributes.get('download_url')
+ entries = []
+ idx = 0
+ for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
+ include_type = include['type']
+ if include_type == 'media':
+ media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
+ download_url = url_or_none(media_attributes.get('download_url'))
ext = mimetype2ext(media_attributes.get('mimetype'))
# if size_bytes is None, this media file is likely unavailable
# See: https://github.com/yt-dlp/yt-dlp/issues/4608
size_bytes = int_or_none(media_attributes.get('size_bytes'))
if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
- # XXX: what happens if there are multiple attachments?
- return {
- **info,
+ idx += 1
+ entries.append({
+ 'id': f'{video_id}-{idx}',
'ext': ext,
'filesize': size_bytes,
'url': download_url,
- }
- elif i_type == 'user':
- user_attributes = i.get('attributes')
- if user_attributes:
- info.update({
- 'uploader': user_attributes.get('full_name'),
- 'uploader_id': str_or_none(i.get('id')),
- 'uploader_url': user_attributes.get('url'),
})
- elif i_type == 'post_tag':
- info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value')))
+ elif include_type == 'user':
+ info.update(traverse_obj(include, {
+ 'uploader': ('attributes', 'full_name', {str}),
+ 'uploader_id': ('id', {str_or_none}),
+ 'uploader_url': ('attributes', 'url', {url_or_none}),
+ }))
- elif i_type == 'campaign':
- info.update({
- 'channel': traverse_obj(i, ('attributes', 'title')),
- 'channel_id': str_or_none(i.get('id')),
- 'channel_url': traverse_obj(i, ('attributes', 'url')),
- 'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))),
- })
+ elif include_type == 'post_tag':
+ if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
+ info.setdefault('tags', []).append(post_tag)
+
+ elif include_type == 'campaign':
+ info.update(traverse_obj(include, {
+ 'channel': ('attributes', 'title', {str}),
+ 'channel_id': ('id', {str_or_none}),
+ 'channel_url': ('attributes', 'url', {url_or_none}),
+ 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
+ }))
# handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
@@ -296,36 +314,50 @@ def _real_extract(self, url):
v_url, video_id, 'Checking Vimeo embed URL',
headers={'Referer': 'https://patreon.com/'},
fatal=False, errnote=False):
- return self.url_result(
+ entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
- VimeoIE, url_transparent=True, **info)
+ VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
- return self.url_result(embed_url, **info)
+ entries.append(self.url_result(embed_url))
- post_file = traverse_obj(attributes, 'post_file')
+ post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file:
name = post_file.get('name')
ext = determine_ext(name)
if ext in KNOWN_EXTENSIONS:
- return {
- **info,
+ entries.append({
+ 'id': video_id,
'ext': ext,
'url': post_file['url'],
- }
+ })
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
- return {
- **info,
+ entries.append({
+ 'id': video_id,
'formats': formats,
'subtitles': subtitles,
- }
+ })
- if can_view_post is False:
+ can_view_post = traverse_obj(attributes, 'current_user_can_view')
+ comments = None
+ if can_view_post and info.get('comment_count'):
+ comments = self.extract_comments(video_id)
+
+ if not entries and can_view_post is False:
self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
- else:
+ elif not entries:
self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
+ elif len(entries) == 1:
+ info.update(entries[0])
+ else:
+ for entry in entries:
+ entry.update(info)
+ return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
+
+ info['id'] = video_id
+ info['__post_extractor'] = comments
return info
def _get_comments(self, post_id):
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index c9ed645eb..c9ca41a5c 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -361,7 +361,7 @@ def extract_count(key):
'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
- 'genre': info.get('genre'),
+ 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
'formats': formats if not extract_flat else None
}
@@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE):
_TESTS = [
{
'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
- 'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
+ 'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
'info_dict': {
'id': '62986583',
- 'ext': 'mp3',
+ 'ext': 'opus',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music',
@@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
+ 'uploader_url': 'https://soundcloud.com/ethmusic',
+ 'genres': [],
}
},
# geo-restricted
@@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE):
'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
'info_dict': {
'id': '47127627',
- 'ext': 'mp3',
+ 'ext': 'opus',
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
@@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'uploader_url': 'https://soundcloud.com/the-concept-band',
+ 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
+ 'genres': ['Alternative'],
},
},
# private link
@@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'uploader_url': 'https://soundcloud.com/jaimemf',
+ 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
+ 'genres': ['youtubedl'],
},
},
# private link (alt format)
@@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'uploader_url': 'https://soundcloud.com/jaimemf',
+ 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
+ 'genres': ['youtubedl'],
},
},
# downloadable song
@@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE):
'info_dict': {
'id': '343609555',
'ext': 'wav',
+ 'title': 'The Following',
+ 'description': '',
+ 'uploader': '80M',
+ 'uploader_id': '312384765',
+ 'uploader_url': 'https://soundcloud.com/the80m',
+ 'upload_date': '20170922',
+ 'timestamp': 1506120436,
+ 'duration': 397.228,
+ 'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
+ 'license': 'all-rights-reserved',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'view_count': int,
+ 'genres': ['Dance & EDM'],
},
},
# private link, downloadable format
@@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
+ 'uploader_url': 'https://soundcloud.com/oriuplift',
+ 'genres': ['Trance'],
},
},
# no album art, use avatar pic for thumbnail
@@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'uploader_url': 'https://soundcloud.com/garyvee',
+ 'genres': [],
},
'params': {
'skip_download': True,
@@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE):
},
{
'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
- 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
+ 'md5': '8227c3473a4264df6b02ad7e5b7527ac',
'info_dict': {
'id': '583011102',
- 'ext': 'mp3',
+ 'ext': 'opus',
'title': 'Mezzo Valzer',
- 'description': 'md5:4138d582f81866a530317bae316e8b61',
- 'uploader': 'Micronie',
+ 'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
+ 'uploader': 'Giovanni Sarani',
'uploader_id': '3352531',
'timestamp': 1551394171,
'upload_date': '20190228',
@@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
+ 'genres': ['Piano'],
+ 'uploader_url': 'https://soundcloud.com/giovannisarani',
},
},
{
diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py
index a445fae85..52ff230f2 100644
--- a/yt_dlp/extractor/tv5mondeplus.py
+++ b/yt_dlp/extractor/tv5mondeplus.py
@@ -2,85 +2,88 @@
from .common import InfoExtractor
from ..utils import (
+ clean_html,
determine_ext,
extract_attributes,
+ get_element_by_class,
+ get_element_html_by_class,
int_or_none,
- parse_duration,
- traverse_obj,
- try_get,
url_or_none,
)
+from ..utils.traversal import traverse_obj
class TV5MondePlusIE(InfoExtractor):
- IE_DESC = 'TV5MONDE+'
- _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P ]+class=["\']episode-emission[^>]+>([^<]+)', webpage,
- 'series', default=None)
-
- if series and series != title:
- title = '%s - %s' % (series, title)
-
- upload_date = self._search_regex(
- r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})',
- webpage, 'upload date', default=None)
- if upload_date:
- upload_date = upload_date.replace('_', '')
+ vpl_data.get('data-metadata') or '{}', display_id, fatal=False)
if not video_id:
video_id = self._search_regex(
@@ -175,16 +158,20 @@ def process_video_files(v):
default=display_id)
return {
+ **traverse_obj(metadata, ('content', {
+ 'id': ('id', {str}),
+ 'title': ('title', {str}),
+ 'episode': ('title', {str}),
+ 'series': ('series', {str}),
+ 'timestamp': ('publishDate_ts', {int_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ })),
'id': video_id,
'display_id': display_id,
- 'title': title,
- 'description': description,
- 'thumbnail': vpl_data.get('data-image'),
- 'duration': duration,
- 'upload_date': upload_date,
+ 'title': clean_html(get_element_by_class('main-title', webpage)),
+ 'description': clean_html(get_element_by_class('text', get_element_html_by_class('ep-summary', webpage) or '')),
+ 'thumbnail': url_or_none(vpl_data.get('data-image')),
'formats': formats,
'subtitles': self._extract_subtitles(self._parse_json(
traverse_obj(vpl_data, ('data-captions', {str}), default='{}'), display_id, fatal=False)),
- 'series': series,
- 'episode': episode,
}
diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py
index 9afe23328..e3e10557c 100644
--- a/yt_dlp/extractor/tva.py
+++ b/yt_dlp/extractor/tva.py
@@ -1,10 +1,9 @@
+import functools
+import re
+
from .common import InfoExtractor
-from ..utils import (
- float_or_none,
- int_or_none,
- smuggle_url,
- strip_or_none,
-)
+from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none
+from ..utils.traversal import traverse_obj
class TVAIE(InfoExtractor):
@@ -49,11 +48,20 @@ class QubIE(InfoExtractor):
'info_dict': {
'id': '6084352463001',
'ext': 'mp4',
- 'title': 'Épisode 01',
+ 'title': 'Ép 01. Mon dernier jour',
'uploader_id': '5481942443001',
'upload_date': '20190907',
'timestamp': 1567899756,
'description': 'md5:9c0d7fbb90939420c651fd977df90145',
+ 'thumbnail': r're:https://.+\.jpg',
+ 'episode': 'Ép 01. Mon dernier jour',
+ 'episode_number': 1,
+ 'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'],
+ 'duration': 2625.963,
+ 'season': 'Season 1',
+ 'season_number': 1,
+ 'series': 'Alerte Amber',
+ 'channel': 'TVA',
},
}, {
'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
@@ -64,22 +72,24 @@ class QubIE(InfoExtractor):
def _real_extract(self, url):
entity_id = self._match_id(url)
- entity = self._download_json(
- 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities',
- entity_id, query={'id': entity_id})
+ webpage = self._download_webpage(url, entity_id)
+ entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData']
video_id = entity['videoId']
episode = strip_or_none(entity.get('name'))
return {
'_type': 'url_transparent',
+ 'url': f'https://videos.tva.ca/details/_{video_id}',
+ 'ie_key': TVAIE.ie_key(),
'id': video_id,
'title': episode,
- # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'],
- 'url': 'https://videos.tva.ca/details/_' + video_id,
- 'description': entity.get('longDescription'),
- 'duration': float_or_none(entity.get('durationMillis'), 1000),
'episode': episode,
- 'episode_number': int_or_none(entity.get('episodeNumber')),
- # 'ie_key': 'BrightcoveNew',
- 'ie_key': TVAIE.ie_key(),
+ **traverse_obj(entity, {
+ 'description': ('longDescription', {str}),
+ 'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}),
+ 'channel': ('knownEntities', 'channel', 'name', {str}),
+ 'series': ('knownEntities', 'videoShow', 'name', {str}),
+ 'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}),
+ 'episode_number': ('episodeNumber', {int_or_none}),
+ }),
}
diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py
index 7e3a3a9a9..28d502685 100644
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -451,6 +451,7 @@ def _real_extract(self, url):
info_page, 'view count', default=None))
formats = []
+ subtitles = {}
for format_id, format_url in data.items():
format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
@@ -462,12 +463,21 @@ def _real_extract(self, url):
formats.append({
'format_id': format_id,
'url': format_url,
+ 'ext': 'mp4',
+ 'source_preference': 1,
'height': height,
})
elif format_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id, fatal=False, live=is_live))
+ m3u8_id=format_id, fatal=False, live=is_live)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+ elif format_id.startswith('dash_'):
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ format_url, video_id, mpd_id=format_id, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
elif format_id == 'rtmp':
formats.append({
'format_id': format_id,
@@ -475,7 +485,6 @@ def _real_extract(self, url):
'ext': 'flv',
})
- subtitles = {}
for sub in data.get('subs') or {}:
subtitles.setdefault(sub.get('lang', 'en'), []).append({
'ext': sub.get('title', '.srt').split('.')[-1],
@@ -496,6 +505,7 @@ def _real_extract(self, url):
'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live,
'subtitles': subtitles,
+ '_format_sort_fields': ('res', 'source'),
}
diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py
index 880ee519b..d401d6d39 100644
--- a/yt_dlp/extractor/wrestleuniverse.py
+++ b/yt_dlp/extractor/wrestleuniverse.py
@@ -12,6 +12,7 @@
jwt_decode_hs256,
traverse_obj,
try_call,
+ url_basename,
url_or_none,
urlencode_postdata,
variadic,
@@ -194,8 +195,7 @@ def _real_extract(self, url):
return {
'id': video_id,
- 'formats': self._get_formats(video_data, (
- (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id),
+ 'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id),
**traverse_obj(metadata, {
'title': ('displayName', {str}),
'description': ('description', {str}),
@@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE):
'params': {
'skip_download': 'm3u8',
},
+ }, {
+ 'note': 'manifest provides live-a (partial) and live-b (full) streams',
+ 'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g',
+ 'only_matching': True,
}]
_API_PATH = 'events'
@@ -285,12 +289,16 @@ def _real_extract(self, url):
video_data, decrypt = self._call_encrypted_api(
video_id, ':watchArchive', 'watch archive', data={'method': 1})
- info['formats'] = self._get_formats(video_data, (
- ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id)
+ # 'chromecastUrls' may contain only partial videos, so avoid them
+ info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id)
for f in info['formats']:
# bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
if f.get('tbr'):
f['tbr'] = int(f['tbr'] / 2.5)
+ # prefer variants with the same basename as the master playlist to avoid partial streams
+ f['format_id'] = url_basename(f['url']).partition('.')[0]
+ if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]):
+ f['preference'] = -10
hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
if hls_aes_key:
diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py
index 4382a5684..95a9446e3 100644
--- a/yt_dlp/extractor/yandexvideo.py
+++ b/yt_dlp/extractor/yandexvideo.py
@@ -259,15 +259,15 @@ def _real_extract(self, url):
webpage = self._download_webpage(redirect, video_id, note='Redirecting')
data_json = self._search_json(
r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
- serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)',
- webpage, 'server state').replace('State', 'Settings')
+ serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state')
uploader = self._search_regex(r'(]+>)',
webpage, 'uploader', default='')
uploader_name = extract_attributes(uploader).get('aria-label')
- video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict)
- stream_urls = try_get(video_json, lambda x: x['video']['streams'])
+ item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str}))
+ video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {}
+
formats, subtitles = [], {}
- for s_url in stream_urls:
+ for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})):
ext = determine_ext(s_url)
if ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')
diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py
index 6ee0abcae..6d4e31bf3 100644
--- a/yt_dlp/extractor/youporn.py
+++ b/yt_dlp/extractor/youporn.py
@@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor):
'id': '16290308',
'age_limit': 18,
'categories': [],
- 'description': 'md5:00ea70f642f431c379763c17c2f396bc',
+ 'description': str, # TODO: detect/remove SEO spam description in ytdl backport
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
- 'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg',
- 'timestamp': 1606089600,
+ 'thumbnail': r're:https://.+\.jpg',
+ 'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
@@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor):
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
- definitions = self._download_json(
- f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
+ self._set_cookie('.youporn.com', 'age_verified', '1')
+ webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id)
+ definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions']
- def get_format_data(data, f):
- return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
+ def get_format_data(data, stream_type):
+ info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any))
+ if not info_url:
+ return []
+ return traverse_obj(
+ self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False),
+ lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl']))
formats = []
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
@@ -123,10 +129,6 @@ def get_format_data(data, f):
f['height'] = height
formats.append(f)
- webpage = self._download_webpage(
- 'http://www.youporn.com/watch/%s' % video_id, display_id,
- headers={'Cookie': 'age_verified=1'})
-
title = self._html_search_regex(
r'(?s)([^<]+)', webpage, 'title')
vpl_data = extract_attributes(self._search_regex(
r'(<[^>]+class="video_player_loader"[^>]+>)',
webpage, 'video player loader'))
@@ -147,26 +149,7 @@ def process_video_files(v):
process_video_files(video_files)
metadata = self._parse_json(
- vpl_data['data-metadata'], display_id)
- duration = (int_or_none(try_get(metadata, lambda x: x['content']['duration']))
- or parse_duration(self._html_search_meta('duration', webpage)))
-
- description = self._html_search_regex(
- r'(?s)