diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 352de83ca..83e732189 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1872,6 +1872,11 @@
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+ ThisVidIE,
+ ThisVidMemberIE,
+ ThisVidPlaylistIE,
+)
from .threespeak import (
ThreeSpeakIE,
ThreeSpeakUserIE,
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index f48b97a6b..21d5c39fd 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1396,10 +1396,16 @@ def _rta_search(html):
# And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled RTA',
+ r'>[^<]*you acknowledge you are at least (\d+) years old',
+ r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
- if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
- return 18
- return 0
+
+ age_limit = 0
+ for marker in AGE_LIMIT_MARKERS:
+ mobj = re.search(marker, html)
+ if mobj:
+ age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
+ return age_limit
def _media_rating_search(self, html):
# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
@@ -3216,7 +3222,7 @@ def manifest_url(manifest):
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+ r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
webpage)
if mobj:
try:
@@ -3237,19 +3243,20 @@ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- # JWPlayer backward compatibility: flattened playlists
- # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
- if 'playlist' not in jwplayer_data:
- jwplayer_data = {'playlist': [jwplayer_data]}
-
entries = []
+ if not isinstance(jwplayer_data, dict):
+ return entries
- # JWPlayer backward compatibility: single playlist item
+ playlist_items = jwplayer_data.get('playlist')
+ # JWPlayer backward compatibility: single playlist item/flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
- if not isinstance(jwplayer_data['playlist'], list):
- jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if not isinstance(playlist_items, list):
+ playlist_items = (playlist_items or jwplayer_data, )
- for video_data in jwplayer_data['playlist']:
+ for video_data in playlist_items:
+ if not isinstance(video_data, dict):
+ continue
# JWPlayer backward compatibility: flattened sources
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
if 'sources' not in video_data:
@@ -3287,6 +3294,13 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
+ 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
+ 'genre': clean_html(video_data.get('genre')),
+ 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+ 'season_number': int_or_none(video_data.get('season')),
+ 'episode_number': int_or_none(video_data.get('episode')),
+ 'release_year': int_or_none(video_data.get('releasedate')),
+ 'age_limit': int_or_none(video_data.get('age_restriction')),
}
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@@ -3304,7 +3318,7 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- urls = []
+ urls = set()
formats = []
for source in jwplayer_sources_data:
if not isinstance(source, dict):
@@ -3313,14 +3327,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
base_url, self._proto_relative_url(source.get('file')))
if not source_url or source_url in urls:
continue
- urls.append(source_url)
+ urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
+ if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False))
- elif source_type == 'dash' or ext == 'mpd':
+ elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil':
@@ -3335,13 +3349,12 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
'ext': ext,
})
else:
+ format_id = str_or_none(source.get('label'))
height = int_or_none(source.get('height'))
- if height is None:
+ if height is None and format_id:
# Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080.
- height = int_or_none(self._search_regex(
- r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
- 'height', default=None))
+ height = parse_resolution(format_id).get('height')
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
@@ -3349,6 +3362,7 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
'tbr': int_or_none(source.get('bitrate'), scale=1000),
'filesize': int_or_none(source.get('filesize')),
'ext': ext,
+ 'format_id': format_id
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index ffc279023..14d492f07 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -32,6 +32,7 @@
unified_timestamp,
unsmuggle_url,
url_or_none,
+ urljoin,
variadic,
xpath_attr,
xpath_text,
@@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'description': 'Kelis - 4th Of July',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Untested major version'],
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/embed/105/',
@@ -1880,35 +1883,12 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
- # KVS Player
- 'url': 'https://thisvid.com/videos/french-boy-pantsed/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
- 'url': 'https://thisvid.com/embed/2400174/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
@@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://youix.com/embed/18485',
@@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Ленинград - ЗОЖ',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
- 'display_id': '40-nochey-40-nights-2016',
+ 'display_id': '40-nochey-2016',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
+ 'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
- }
+ },
},
{
# KVS Player (for sites that serve kt_player.js via non-https urls)
@@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor):
'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4',
- 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
- 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
- }
+ 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+ 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+ },
},
{
# Reddit-hosted video that will redirect and be processed by RedditIE
@@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor):
'direct': True,
'age_limit': 0,
}
- }
+ },
+ {
+ 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+ 'md5': 'e2f0a4c329f7986280b7328e24036d60',
+ 'info_dict': {
+ 'id': '284002',
+ 'display_id': 'just-out-of-the-shower-joi',
+ 'ext': 'mp4',
+ 'title': 'Just Out Of The Shower JOI - Shooshtime',
+ 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
+ 'height': 720,
+ 'age_limit': 18,
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -2235,43 +2229,87 @@ def itunes(key):
'entries': entries,
}
- def _kvs_getrealurl(self, video_url, license_code):
+ @classmethod
+ def _kvs_get_real_url(cls, video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
- url_path, _, url_query = video_url.partition('?')
- urlparts = url_path.split('/')[2:]
- license = self._kvs_getlicensetoken(license_code)
- newmagic = urlparts[5][:32]
+ parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
+ license = cls._kvs_get_license_token(license_code)
+ urlparts = parsed.path.split('/')
- for o in range(len(newmagic) - 1, -1, -1):
- new = ''
- l = (o + sum(int(n) for n in license[o:])) % 32
+ HASH_LENGTH = 32
+ hash = urlparts[3][:HASH_LENGTH]
+ indices = list(range(HASH_LENGTH))
- for i in range(0, len(newmagic)):
- if i == o:
- new += newmagic[l]
- elif i == l:
- new += newmagic[o]
- else:
- new += newmagic[i]
- newmagic = new
+ # Swap indices of hash according to the destination calculated from the license token
+ accum = 0
+ for src in reversed(range(HASH_LENGTH)):
+ accum += license[src]
+ dest = (src + accum) % HASH_LENGTH
+ indices[src], indices[dest] = indices[dest], indices[src]
- urlparts[5] = newmagic + urlparts[5][32:]
- return '/'.join(urlparts) + '?' + url_query
+ urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
+ return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
- def _kvs_getlicensetoken(self, license):
- modlicense = license.replace('$', '').replace('0', '1')
- center = int(len(modlicense) / 2)
+ @staticmethod
+ def _kvs_get_license_token(license):
+ license = license.replace('$', '')
+ license_values = [int(char) for char in license]
+
+ modlicense = license.replace('0', '1')
+ center = len(modlicense) // 2
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
+ modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
- modlicense = str(4 * abs(fronthalf - backhalf))
- retval = ''
- for o in range(0, center + 1):
- for i in range(1, 5):
- retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
- return retval
+ return [
+ (license_values[index + offset] + current) % 10
+ for index, current in enumerate(map(int, modlicense))
+ for offset in range(4)
+ ]
+
+ def _extract_kvs(self, url, webpage, video_id):
+ flashvars = self._search_json(
+ r'(?s:', webpage)
- flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
-
- # extract the part after the last / as the display_id from the
- # canonical URL.
- display_id = self._search_regex(
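- r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
- r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',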
- webpage, 'display_id', fatal=False
- )
- title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
-
- thumbnail = flashvars['preview_url']
- if thumbnail.startswith('//'):
- protocol, _, _ = url.partition('/')
- thumbnail = protocol + thumbnail
-
- url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
- formats = []
- for key in url_keys:
- if '/get_file/' not in flashvars[key]:
- continue
- format_id = flashvars.get(f'{key}_text', key)
- formats.append({
- 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
- 'format_id': format_id,
- 'ext': 'mp4',
- **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
- })
- if not formats[-1].get('height'):
- formats[-1]['quality'] = 1
-
- return [{
- 'id': flashvars['video_id'],
- 'display_id': display_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }]
if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py
index 2d9b9a742..d1fc058b9 100644
--- a/yt_dlp/extractor/peekvids.py
+++ b/yt_dlp/extractor/peekvids.py
@@ -1,71 +1,128 @@
+import re
+
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ get_element_by_class,
+ int_or_none,
+ merge_dicts,
+ url_or_none,
+)
-class PeekVidsIE(InfoExtractor):
+class PeekVidsBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ domain, video_id = self._match_valid_url(url).group('domain', 'id')
+ webpage = self._download_webpage(url, video_id, expected_status=429)
+ if '>Rate Limit Exceeded' in webpage:
+ raise ExtractorError(
+ f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}',
+ video_id=video_id, expected=True)
+
+ title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title')
+
+ display_id = video_id
+ video_id = self._search_regex(r'(?s)