Merge branch 'yt-dlp:master' into rls/arm-ubuntu-bump

This commit is contained in:
bashonly 2024-03-04 11:01:23 -06:00
commit 23ad448446
No known key found for this signature in database
GPG key ID: 783F096F253D15B0
23 changed files with 693 additions and 319 deletions

View file

@ -192,8 +192,8 @@ def test_raise_http_error(self, handler, status):
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('params,extensions', [
({'timeout': 0.00001}, {}),
({}, {'timeout': 0.00001}),
({'timeout': sys.float_info.min}, {}),
({}, {'timeout': sys.float_info.min}),
])
def test_timeout(self, handler, params, extensions):
with handler(**params) as rh:

View file

@ -690,7 +690,6 @@ def process_color_policy(stream):
self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
self._load_cookies(self.params['http_headers'].get('Cookie')) # compat
self.params['http_headers'].pop('Cookie', None)
self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
if auto_init and auto_init != 'no_verbose_header':
self.print_debug_header()
@ -964,6 +963,7 @@ def __exit__(self, *args):
def close(self):
self.save_cookies()
self._request_director.close()
del self._request_director
def trouble(self, message=None, tb=None, is_error=True):
"""Determine action to take when a download problem appears.
@ -4160,6 +4160,10 @@ def build_request_director(self, handlers, preferences=None):
director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0)
return director
@functools.cached_property
def _request_director(self):
return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
def encode(self, s):
if isinstance(s, bytes):
return s # Already encoded

View file

@ -444,6 +444,7 @@
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
DailymotionSearchIE,
DailymotionUserIE,
)
from .dailywire import (
@ -2499,6 +2500,7 @@
Zee5SeriesIE,
)
from .zeenews import ZeeNewsIE
from .zenporn import ZenPornIE
from .zetland import ZetlandDKArticleIE
from .zhihu import ZhihuIE
from .zingmp3 import (

View file

@ -22,7 +22,7 @@ class AltCensoredIE(InfoExtractor):
'title': "QUELLES SONT LES CONSÉQUENCES DE L'HYPERSEXUALISATION DE LA SOCIÉTÉ ?",
'display_id': 'k0srjLSkga8.webm',
'release_date': '20180403',
'creator': 'Virginie Vota',
'creators': ['Virginie Vota'],
'release_year': 2018,
'upload_date': '20230318',
'uploader': 'admin@altcensored.com',
@ -32,7 +32,7 @@ class AltCensoredIE(InfoExtractor):
'duration': 926.09,
'thumbnail': 'https://archive.org/download/youtube-k0srjLSkga8/youtube-k0srjLSkga8.thumbs/k0srjLSkga8_000925.jpg',
'view_count': int,
'categories': ['News & Politics'],
'categories': ['News & Politics'], # FIXME
}
}]
@ -62,14 +62,21 @@ class AltCensoredChannelIE(InfoExtractor):
'title': 'Virginie Vota',
'id': 'UCFPTO55xxHqFqkzRZHu4kcw',
},
'playlist_count': 91
'playlist_count': 85,
}, {
'url': 'https://altcensored.com/channel/UC9CcJ96HKMWn0LZlcxlpFTw',
'info_dict': {
'title': 'yukikaze775',
'id': 'UC9CcJ96HKMWn0LZlcxlpFTw',
},
'playlist_count': 4
'playlist_count': 4,
}, {
'url': 'https://altcensored.com/channel/UCfYbb7nga6-icsFWWgS-kWw',
'info_dict': {
'title': 'Mister Metokur',
'id': 'UCfYbb7nga6-icsFWWgS-kWw',
},
'playlist_count': 121,
}]
def _real_extract(self, url):
@ -78,7 +85,7 @@ def _real_extract(self, url):
url, channel_id, 'Download channel webpage', 'Unable to get channel webpage')
title = self._html_search_meta('altcen_title', webpage, 'title', fatal=False)
page_count = int_or_none(self._html_search_regex(
r'<a[^>]+href="/channel/\w+/page/(\d+)">(?:\1)</a>',
r'<a[^>]+href="/channel/[\w-]+/page/(\d+)">(?:\1)</a>',
webpage, 'page count', default='1'))
def page_func(page_num):

View file

@ -300,7 +300,7 @@ def _real_extract(self, url):
is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
entry['formats'].append({
'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']),
'format': f.get('format'),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),

View file

@ -1996,7 +1996,7 @@ def _extract_video_metadata(self, url, video_id, season_id):
'title': get_element_by_class(
'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage),
'description': get_element_by_class(
'bstar-meta__desc', webpage) or self._html_search_meta('og:description'),
'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage),
}, self._search_json_ld(webpage, video_id, default={}))
def _get_comments_reply(self, root_id, next_id=0, display_id=None):

View file

@ -88,6 +88,20 @@ class CCTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
}, {
# videoCenterId: "id"
'url': 'http://news.cctv.com/2024/02/21/ARTIcU5tKIOIF2myEGCATkLo240221.shtml',
'info_dict': {
'id': '5c846c0518444308ba32c4159df3b3e0',
'ext': 'mp4',
'title': '《平“语”近人——习近平喜欢的典故》第三季 第5集风物长宜放眼量',
'uploader': 'yangjuan',
'timestamp': 1708554940,
'upload_date': '20240221',
},
'params': {
'skip_download': True,
},
}, {
# var ids = ["id"]
'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml',
@ -128,7 +142,7 @@ def _real_extract(self, url):
video_id = self._search_regex(
[r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)',
r'videoCenterId(?:["\']\s*,|:)\s*["\']([\da-fA-F]+)',
r'changePlayer\s*\(\s*["\']([\da-fA-F]+)',
r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)',
r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)',

View file

@ -2,7 +2,7 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
UserNotLive,
float_or_none,
int_or_none,
parse_iso8601,
@ -40,7 +40,7 @@ def _real_extract(self, url):
note='Downloading channel info', errnote='Unable to download channel info')['content']
if live_detail.get('status') == 'CLOSE':
raise ExtractorError('The channel is not currently live', expected=True)
raise UserNotLive(video_id=channel_id)
live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id)

View file

@ -4,27 +4,25 @@
class CloudflareStreamIE(InfoExtractor):
_SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?'
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE
_EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo='
_ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = r'''(?x)
https?://
(?:
(?:watch\.)?%s/|
%s
)
(?P<id>%s)
''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
_EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
]
_TESTS = [{
'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
'info_dict': {
'id': '31c9291ab41fac05471db4e73aa11717',
'ext': 'mp4',
'title': '31c9291ab41fac05471db4e73aa11717',
'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': True,
'skip_download': 'm3u8',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
@ -35,6 +33,21 @@ class CloudflareStreamIE(InfoExtractor):
}, {
'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e',
'only_matching': True,
}, {
'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://upride.cc/incident/shoulder-pass-at-light/',
'info_dict': {
'id': 'eaef9dea5159cf968be84241b5cedfe7',
'ext': 'mp4',
'title': 'eaef9dea5159cf968be84241b5cedfe7',
'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
},
}]
def _real_extract(self, url):

View file

@ -1,6 +1,7 @@
import functools
import json
import re
import urllib.parse
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
@ -44,36 +45,41 @@ def _real_initialize(self):
self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self.get_param('age_limit'))
self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off')
def _get_token(self, xid):
cookies = self._get_dailymotion_cookies()
token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token')
if token:
return token
data = {
'client_id': 'f1a362d288c1b98099c7',
'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
}
username, password = self._get_login_info()
if username:
data.update({
'grant_type': 'password',
'password': password,
'username': username,
})
else:
data['grant_type'] = 'client_credentials'
try:
token = self._download_json(
'https://graphql.api.dailymotion.com/oauth/token',
None, 'Downloading Access Token',
data=urlencode_postdata(data))['access_token']
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise ExtractorError(self._parse_json(
e.cause.response.read().decode(), xid)['error_description'], expected=True)
raise
self._set_dailymotion_cookie('access_token' if username else 'client_token', token)
return token
def _call_api(self, object_type, xid, object_fields, note, filter_extra=None):
if not self._HEADERS.get('Authorization'):
cookies = self._get_dailymotion_cookies()
token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token')
if not token:
data = {
'client_id': 'f1a362d288c1b98099c7',
'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
}
username, password = self._get_login_info()
if username:
data.update({
'grant_type': 'password',
'password': password,
'username': username,
})
else:
data['grant_type'] = 'client_credentials'
try:
token = self._download_json(
'https://graphql.api.dailymotion.com/oauth/token',
None, 'Downloading Access Token',
data=urlencode_postdata(data))['access_token']
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise ExtractorError(self._parse_json(
e.cause.response.read().decode(), xid)['error_description'], expected=True)
raise
self._set_dailymotion_cookie('access_token' if username else 'client_token', token)
self._HEADERS['Authorization'] = 'Bearer ' + token
self._HEADERS['Authorization'] = f'Bearer {self._get_token(xid)}'
resp = self._download_json(
'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({
@ -393,9 +399,55 @@ def _extract_embed_urls(cls, url, webpage):
yield '//dailymotion.com/playlist/%s' % p
class DailymotionSearchIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:search'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/search/(?P<id>[^/?#]+)/videos'
_PAGE_SIZE = 20
_TESTS = [{
'url': 'http://www.dailymotion.com/search/king of turtles/videos',
'info_dict': {
'id': 'king of turtles',
'title': 'king of turtles',
},
'playlist_mincount': 90,
}]
_SEARCH_QUERY = 'query SEARCH_QUERY( $query: String! $page: Int $limit: Int ) { search { videos( query: $query first: $limit page: $page ) { edges { node { xid } } } } } '
def _call_search_api(self, term, page, note):
if not self._HEADERS.get('Authorization'):
self._HEADERS['Authorization'] = f'Bearer {self._get_token(term)}'
resp = self._download_json(
'https://graphql.api.dailymotion.com/', None, note, data=json.dumps({
'operationName': 'SEARCH_QUERY',
'query': self._SEARCH_QUERY,
'variables': {
'limit': 20,
'page': page,
'query': term,
}
}).encode(), headers=self._HEADERS)
obj = traverse_obj(resp, ('data', 'search', {dict}))
if not obj:
raise ExtractorError(
traverse_obj(resp, ('errors', 0, 'message', {str})) or 'Could not fetch search data')
return obj
def _fetch_page(self, term, page):
page += 1
response = self._call_search_api(term, page, f'Searching "{term}" page {page}')
for xid in traverse_obj(response, ('videos', 'edges', ..., 'node', 'xid')):
yield self.url_result(f'https://www.dailymotion.com/video/{xid}', DailymotionIE, xid)
def _real_extract(self, url):
term = urllib.parse.unquote_plus(self._match_id(url))
return self.playlist_result(
OnDemandPagedList(functools.partial(self._fetch_page, term), self._PAGE_SIZE), term, term)
class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {

View file

@ -8,9 +8,9 @@
class DumpertIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl(?:
/(?:mediabase|embed|item)/|
(?:/toppers|/latest|/?)\?selectedId=
(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:
(?:mediabase|embed|item)/|
[^#]*[?&]selectedId=
)(?P<id>[0-9]+[/_][0-9a-zA-Z]+)'''
_TESTS = [{
'url': 'https://www.dumpert.nl/item/6646981_951bc60f',
@ -56,6 +56,9 @@ class DumpertIE(InfoExtractor):
}, {
'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185',
'only_matching': True,
}, {
'url': 'https://www.dumpert.nl/toppers/dag?selectedId=100086074_f5cef3ac',
'only_matching': True,
}]
def _real_extract(self, url):

View file

@ -1,60 +1,49 @@
import re
import urllib.parse
from .common import InfoExtractor
from .dailymotion import DailymotionIE
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
determine_ext,
filter_dict,
format_field,
int_or_none,
join_nonempty,
parse_iso8601,
parse_qs,
smuggle_url,
unsmuggle_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class FranceTVBaseInfoExtractor(InfoExtractor):
def _make_url_result(self, video_or_full_id, catalog=None):
full_id = 'francetv:%s' % video_or_full_id
if '@' not in video_or_full_id and catalog:
full_id += '@%s' % catalog
return self.url_result(
full_id, ie=FranceTVIE.ie_key(),
video_id=video_or_full_id.split('@')[0])
def _make_url_result(self, video_id, url=None):
video_id = video_id.split('@')[0] # for compat with old @catalog IDs
full_id = f'francetv:{video_id}'
if url:
full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname})
return self.url_result(full_id, FranceTVIE, video_id)
class FranceTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
https?://
sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\?
.*?\bidDiffusion=[^&]+|
(?:
https?://videos\.francetv\.fr/video/|
francetv:
)
(?P<id>[^@]+)(?:@(?P<catalog>.+))?
)
'''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
_VALID_URL = r'francetv:(?P<id>[^@#]+)'
_GEO_COUNTRIES = ['FR']
_GEO_BYPASS = False
_TESTS = [{
# without catalog
'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0',
'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f',
'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
'info_dict': {
'id': '162311093',
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
'timestamp': 1502623500,
'duration': 2580,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170813',
},
}, {
# with catalog
'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4',
'only_matching': True,
}, {
'url': 'http://videos.francetv.fr/video/NI_657393@Regions',
'only_matching': True,
'params': {'skip_download': 'm3u8'},
}, {
'url': 'francetv:162311093',
'only_matching': True,
@ -76,10 +65,7 @@ class FranceTVIE(InfoExtractor):
'only_matching': True,
}]
def _extract_video(self, video_id, catalogue=None):
# Videos are identified by idDiffusion so catalogue part is optional.
# However when provided, some extra formats may be returned so we pass
# it if available.
def _extract_video(self, video_id, hostname=None):
is_live = None
videos = []
title = None
@ -91,18 +77,20 @@ def _extract_video(self, video_id, catalogue=None):
timestamp = None
spritesheets = None
for device_type in ('desktop', 'mobile'):
# desktop+chrome returns dash; mobile+safari returns hls
for device_type, browser in [('desktop', 'chrome'), ('mobile', 'safari')]:
dinfo = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
video_id, 'Downloading %s video JSON' % device_type, query={
f'https://k7.ftven.fr/videos/{video_id}', video_id,
f'Downloading {device_type} {browser} video JSON', query=filter_dict({
'device_type': device_type,
'browser': 'chrome',
}, fatal=False)
'browser': browser,
'domain': hostname,
}), fatal=False)
if not dinfo:
continue
video = dinfo.get('video')
video = traverse_obj(dinfo, ('video', {dict}))
if video:
videos.append(video)
if duration is None:
@ -112,7 +100,7 @@ def _extract_video(self, video_id, catalogue=None):
if spritesheets is None:
spritesheets = video.get('spritesheets')
meta = dinfo.get('meta')
meta = traverse_obj(dinfo, ('meta', {dict}))
if meta:
if title is None:
title = meta.get('title')
@ -126,43 +114,47 @@ def _extract_video(self, video_id, catalogue=None):
if timestamp is None:
timestamp = parse_iso8601(meta.get('broadcasted_at'))
formats = []
subtitles = {}
for video in videos:
formats, subtitles, video_url = [], {}, None
for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
video_url = video['url']
format_id = video.get('format')
video_url = None
if video.get('workflow') == 'token-akamai':
token_url = video.get('token')
if token_url:
token_json = self._download_json(
token_url, video_id,
'Downloading signed %s manifest URL' % format_id)
if token_json:
video_url = token_json.get('url')
if not video_url:
video_url = video.get('url')
token_url = url_or_none(video.get('token'))
if token_url and video.get('workflow') == 'token-akamai':
tokenized_url = traverse_obj(self._download_json(
token_url, video_id, f'Downloading signed {format_id} manifest URL',
fatal=False, query={
'format': 'json',
'url': video_url,
}), ('url', {url_or_none}))
if tokenized_url:
video_url = tokenized_url
ext = determine_ext(video_url)
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id=format_id, fatal=False))
video_url, video_id, f4m_id=format_id or ext, fatal=False))
elif ext == 'm3u8':
format_id = format_id or 'hls'
fmts, subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False)
video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None):
if mobj := re.match(rf'{format_id}-[Aa]udio-\w+-(?P<bitrate>\d+)', f['format_id']):
f.update({
'tbr': int_or_none(mobj.group('bitrate')),
'acodec': 'mp4a',
})
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(
video_url, video_id, mpd_id=format_id, fatal=False)
video_url, video_id, mpd_id=format_id or 'dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
'format_id': 'rtmp-%s' % format_id,
'format_id': join_nonempty('rtmp', format_id),
'ext': 'flv',
})
else:
@ -174,6 +166,13 @@ def _extract_video(self, video_id, catalogue=None):
# XXX: what is video['captions']?
if not formats and video_url:
urlh = self._request_webpage(
HEADRequest(video_url), video_id, 'Checking for geo-restriction',
fatal=False, expected_status=403)
if urlh and urlh.headers.get('x-errortype') == 'geo':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
for f in formats:
if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'):
f['language_preference'] = -10
@ -194,7 +193,7 @@ def _extract_video(self, video_id, catalogue=None):
# a 10×10 grid of thumbnails corresponding to approximately
# 2 seconds of the video; the last spritesheet may be shorter
'duration': 200,
} for sheet in spritesheets]
} for sheet in traverse_obj(spritesheets, (..., {url_or_none}))]
})
return {
@ -210,21 +209,15 @@ def _extract_video(self, video_id, catalogue=None):
'series': title if episode_number else None,
'episode_number': int_or_none(episode_number),
'season_number': int_or_none(season_number),
'_format_sort_fields': ('res', 'tbr', 'proto'), # prioritize m3u8 over dash
}
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
catalog = mobj.group('catalog')
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
hostname = smuggled_data.get('hostname') or 'www.france.tv'
if not video_id:
qs = parse_qs(url)
video_id = qs.get('idDiffusion', [None])[0]
catalog = qs.get('catalogue', [None])[0]
if not video_id:
raise ExtractorError('Invalid URL', expected=True)
return self._extract_video(video_id, catalog)
return self._extract_video(video_id, hostname=hostname)
class FranceTVSiteIE(FranceTVBaseInfoExtractor):
@ -246,6 +239,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
},
'add_ie': [FranceTVIE.ie_key()],
}, {
# geo-restricted
'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
'info_dict': {
'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44',
@ -304,17 +298,16 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id)
catalogue = None
video_id = self._search_regex(
r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'video id', default=None, group='id')
if not video_id:
video_id, catalogue = self._html_search_regex(
r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
webpage, 'video ID').split('@')
video_id = self._html_search_regex(
r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@"]+@[^"]+)"',
webpage, 'video ID')
return self._make_url_result(video_id, catalogue)
return self._make_url_result(video_id, url=url)
class FranceTVInfoIE(FranceTVBaseInfoExtractor):
@ -328,8 +321,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'ext': 'mp4',
'title': 'Soir 3',
'upload_date': '20190822',
'timestamp': 1566510900,
'description': 'md5:72d167097237701d6e8452ff03b83c00',
'timestamp': 1566510730,
'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 1637,
'subtitles': {
'fr': 'mincount:2',
},
@ -344,8 +338,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'info_dict': {
'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
'ext': 'mp4',
'title': 'Covid-19 : une situation catastrophique à New Dehli',
'thumbnail': str,
'title': 'Covid-19 : une situation catastrophique à New Dehli - Édition du mercredi 21 avril 2021',
'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 76,
'timestamp': 1619028518,
'upload_date': '20210421',
@ -371,11 +365,17 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'id': 'x4iiko0',
'ext': 'mp4',
'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
'description': 'md5:fdcb582c370756293a65cdfbc6ecd90e',
'timestamp': 1467011958,
'upload_date': '20160627',
'uploader': 'France Inter',
'uploader_id': 'x2q2ez',
'upload_date': '20160627',
'view_count': int,
'tags': ['Politique', 'France Inter', '27 juin 2016', 'Linvité de 8h20', 'Cécile Duflot', 'Patrick Cohen'],
'age_limit': 0,
'duration': 640,
'like_count': int,
'thumbnail': r're:https://[^/?#]+/v/[^/?#]+/x1080',
},
'add_ie': ['Dailymotion'],
}, {
@ -405,4 +405,4 @@ def _real_extract(self, url):
r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
webpage, 'video id')
return self._make_url_result(video_id)
return self._make_url_result(video_id, url=url)

View file

@ -1,8 +1,7 @@
from .common import InfoExtractor
from .francetv import FranceTVIE
from .francetv import FranceTVBaseInfoExtractor
class LumniIE(InfoExtractor):
class LumniIE(FranceTVBaseInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle',
@ -21,4 +20,4 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
r'<div[^>]+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id')
return self.url_result(f'francetv:{video_id}', FranceTVIE, video_id)
return self._make_url_result(video_id, url=url)

View file

@ -13,13 +13,11 @@
from ..utils import (
ExtractorError,
OnDemandPagedList,
bug_reports_message,
clean_html,
float_or_none,
int_or_none,
join_nonempty,
parse_duration,
parse_filesize,
parse_iso8601,
parse_resolution,
qualities,
@ -38,6 +36,8 @@
class NiconicoIE(InfoExtractor):
IE_NAME = 'niconico'
IE_DESC = 'ニコニコ動画'
_GEO_COUNTRIES = ['JP']
_GEO_BYPASS = False
_TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215',
@ -55,25 +55,31 @@ class NiconicoIE(InfoExtractor):
'duration': 33,
'view_count': int,
'comment_count': int,
'genres': ['未設定'],
'tags': [],
'expected_protocol': str,
},
'skip': 'Requires an account',
}, {
# File downloaded with and without credentials are different, so omit
# the md5 field
'url': 'http://www.nicovideo.jp/watch/nm14296458',
'info_dict': {
'id': 'nm14296458',
'ext': 'swf',
'title': '鏡音リン】Dance on media【オリジナル】take2!',
'description': 'md5:689f066d74610b3b22e0f1739add0f58',
'ext': 'mp4',
'title': 'Kagamine Rin】Dance on media【Original】take2!',
'description': 'md5:9368f2b1f4178de64f2602c2f3d6cbf5',
'thumbnail': r're:https?://.*',
'uploader': 'りょうた',
'uploader_id': '18822557',
'upload_date': '20110429',
'timestamp': 1304065916,
'duration': 209,
'duration': 208.0,
'comment_count': int,
'view_count': int,
'genres': ['音楽・サウンド'],
'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'],
'expected_protocol': str,
},
'skip': 'Requires an account',
}, {
# 'video exists but is marked as "deleted"
# md5 is unstable
@ -107,22 +113,24 @@ class NiconicoIE(InfoExtractor):
}, {
# video not available via `getflv`; "old" HTML5 video
'url': 'http://www.nicovideo.jp/watch/sm1151009',
'md5': '8fa81c364eb619d4085354eab075598a',
'md5': 'f95a3d259172667b293530cc2e41ebda',
'info_dict': {
'id': 'sm1151009',
'ext': 'mp4',
'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
'description': 'md5:f95a3d259172667b293530cc2e41ebda',
'thumbnail': r're:https?://.*',
'duration': 184,
'timestamp': 1190868283,
'upload_date': '20070927',
'timestamp': 1190835883,
'upload_date': '20070926',
'uploader': 'denden2',
'uploader_id': '1392194',
'view_count': int,
'comment_count': int,
'genres': ['ゲーム'],
'tags': [],
'expected_protocol': str,
},
'skip': 'Requires an account',
}, {
# "New" HTML5 video
# md5 is unstable
@ -132,16 +140,18 @@ class NiconicoIE(InfoExtractor):
'ext': 'mp4',
'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
'timestamp': 1498514060,
'timestamp': 1498481660,
'upload_date': '20170626',
'uploader': 'ゲスト',
'uploader': 'no-namamae',
'uploader_id': '40826363',
'thumbnail': r're:https?://.*',
'duration': 198,
'view_count': int,
'comment_count': int,
'genres': ['アニメ'],
'tags': [],
'expected_protocol': str,
},
'skip': 'Requires an account',
}, {
# Video without owner
'url': 'http://www.nicovideo.jp/watch/sm18238488',
@ -151,7 +161,7 @@ class NiconicoIE(InfoExtractor):
'ext': 'mp4',
'title': '【実写版】ミュータントタートルズ',
'description': 'md5:15df8988e47a86f9e978af2064bf6d8e',
'timestamp': 1341160408,
'timestamp': 1341128008,
'upload_date': '20120701',
'uploader': None,
'uploader_id': None,
@ -159,8 +169,10 @@ class NiconicoIE(InfoExtractor):
'duration': 5271,
'view_count': int,
'comment_count': int,
'genres': ['エンターテイメント'],
'tags': [],
'expected_protocol': str,
},
'skip': 'Requires an account',
}, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
@ -353,15 +365,10 @@ def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dm
if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
return None
def extract_video_quality(video_quality):
return parse_filesize('%sB' % self._search_regex(
r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default=''))
format_id = '-'.join(
[remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol])
vid_qual_label = traverse_obj(video_quality, ('metadata', 'label'))
vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate'))
return {
'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']),
@ -370,10 +377,15 @@ def extract_video_quality(video_quality):
'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
'acodec': 'aac',
'vcodec': 'h264',
'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000),
'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000),
'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')),
'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')),
**traverse_obj(audio_quality, ('metadata', {
'abr': ('bitrate', {functools.partial(float_or_none, scale=1000)}),
'asr': ('samplingRate', {int_or_none}),
})),
**traverse_obj(video_quality, ('metadata', {
'vbr': ('bitrate', {functools.partial(float_or_none, scale=1000)}),
'height': ('resolution', 'height', {int_or_none}),
'width': ('resolution', 'width', {int_or_none}),
})),
'quality': -2 if 'low' in video_quality['id'] else None,
'protocol': 'niconico_dmc',
'expected_protocol': dmc_protocol, # XXX: This is not a documented field
@ -383,6 +395,63 @@ def extract_video_quality(video_quality):
}
}
def _yield_dmc_formats(self, api_data, video_id):
dmc_data = traverse_obj(api_data, ('media', 'delivery', 'movie'))
audios = traverse_obj(dmc_data, ('audios', ..., {dict}))
videos = traverse_obj(dmc_data, ('videos', ..., {dict}))
protocols = traverse_obj(dmc_data, ('session', 'protocols', ..., {str}))
if not all((audios, videos, protocols)):
return
for audio_quality, video_quality, protocol in itertools.product(audios, videos, protocols):
if fmt := self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol):
yield fmt
def _yield_dms_formats(self, api_data, video_id):
fmt_filter = lambda _, v: v['isAvailable'] and v['id']
videos = traverse_obj(api_data, ('media', 'domand', 'videos', fmt_filter))
audios = traverse_obj(api_data, ('media', 'domand', 'audios', fmt_filter))
access_key = traverse_obj(api_data, ('media', 'domand', 'accessRightKey', {str}))
track_id = traverse_obj(api_data, ('client', 'watchTrackId', {str}))
if not all((videos, audios, access_key, track_id)):
return
dms_m3u8_url = self._download_json(
f'https://nvapi.nicovideo.jp/v1/watch/{video_id}/access-rights/hls', video_id,
data=json.dumps({
'outputs': list(itertools.product((v['id'] for v in videos), (a['id'] for a in audios)))
}).encode(), query={'actionTrackId': track_id}, headers={
'x-access-right-key': access_key,
'x-frontend-id': 6,
'x-frontend-version': 0,
'x-request-with': 'https://www.nicovideo.jp',
})['data']['contentUrl']
# Getting all audio formats results in duplicate video formats which we filter out later
dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id)
# m3u8 extraction does not provide audio bitrates, so extract from the API data and fix
for audio_fmt in traverse_obj(dms_fmts, lambda _, v: v['vcodec'] == 'none'):
yield {
**audio_fmt,
**traverse_obj(audios, (lambda _, v: audio_fmt['format_id'].startswith(v['id']), {
'format_id': ('id', {str}),
'abr': ('bitRate', {functools.partial(float_or_none, scale=1000)}),
'asr': ('samplingRate', {int_or_none}),
}), get_all=False),
'acodec': 'aac',
'ext': 'm4a',
}
# Sort before removing dupes to keep the format dicts with the lowest tbr
video_fmts = sorted((fmt for fmt in dms_fmts if fmt['vcodec'] != 'none'), key=lambda f: f['tbr'])
self._remove_duplicate_formats(video_fmts)
# Calculate the true vbr/tbr by subtracting the lowest abr
min_abr = min(traverse_obj(audios, (..., 'bitRate', {float_or_none})), default=0) / 1000
for video_fmt in video_fmts:
video_fmt['tbr'] -= min_abr
video_fmt['format_id'] = f'video-{video_fmt["tbr"]:.0f}'
yield video_fmt
def _real_extract(self, url):
video_id = self._match_id(url)
@ -409,19 +478,29 @@ def _real_extract(self, url):
webpage, 'error reason', default=None)
if not error_msg:
raise
raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True)
raise ExtractorError(clean_html(error_msg), expected=True)
formats = []
def get_video_info(*items, get_first=True, **kwargs):
return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs)
quality_info = api_data['media']['delivery']['movie']
session_api_data = quality_info['session']
for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']):
fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol)
if fmt:
formats.append(fmt)
availability = self._availability(**(traverse_obj(api_data, ('payment', 'video', {
'needs_premium': ('isPremium', {bool}),
'needs_subscription': ('isAdmission', {bool}),
})) or {'needs_auth': True}))
formats = [*self._yield_dmc_formats(api_data, video_id),
*self._yield_dms_formats(api_data, video_id)]
if not formats:
fail_msg = clean_html(self._html_search_regex(
r'<p[^>]+\bclass="fail-message"[^>]*>(?P<msg>.+?)</p>',
webpage, 'fail message', default=None, group='msg'))
if fail_msg:
self.to_screen(f'Niconico said: {fail_msg}')
if fail_msg and 'された地域と同じ地域からのみ視聴できます。' in fail_msg:
availability = None
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
elif availability == 'premium_only':
self.raise_login_required('This video requires premium', metadata_available=True)
elif availability == 'subscriber_only':
self.raise_login_required('This video is for members only', metadata_available=True)
elif availability == 'needs_auth':
self.raise_login_required(metadata_available=False)
# Start extracting information
tags = None
@ -440,11 +519,15 @@ def get_video_info(*items, get_first=True, **kwargs):
thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp'])
def get_video_info(*items, get_first=True, **kwargs):
return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs)
return {
'id': video_id,
'_api_data': api_data,
'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None),
'formats': formats,
'availability': availability,
'thumbnails': [{
'id': key,
'url': url,
@ -472,8 +555,11 @@ def get_video_info(*items, get_first=True, **kwargs):
def _get_subtitles(self, video_id, api_data):
comments_info = traverse_obj(api_data, ('comment', 'nvComment', {dict})) or {}
if not comments_info.get('server'):
return
danmaku = traverse_obj(self._download_json(
f'{comments_info.get("server")}/v1/threads', video_id, data=json.dumps({
f'{comments_info["server"]}/v1/threads', video_id, data=json.dumps({
'additionals': {},
'params': comments_info.get('params'),
'threadKey': comments_info.get('threadKey'),
@ -489,10 +575,6 @@ def _get_subtitles(self, video_id, api_data):
note='Downloading comments', errnote='Failed to download comments'),
('data', 'threads', ..., 'comments', ...))
if not danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',

View file

@ -35,6 +35,7 @@ class NTVRuIE(InfoExtractor):
'duration': 172,
'view_count': int,
},
'skip': '404 Not Found',
}, {
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
'md5': '82dbd49b38e3af1d00df16acbeab260c',
@ -78,7 +79,8 @@ class NTVRuIE(InfoExtractor):
}]
_VIDEO_ID_REGEXES = [
r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:url" content="https?://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:video:(?:url|iframe)" content="https?://www\.ntv\.ru/embed/(\d+)',
r'<video embed=[^>]+><id>(\d+)</id>',
r'<video restriction[^>]+><key>(\d+)</key>',
]

View file

@ -28,6 +28,29 @@ class RaiBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
def _fix_m3u8_formats(self, media_url, video_id):
fmts = self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
# Fix malformed m3u8 manifests by setting audio-only/video-only formats
for f in fmts:
if not f.get('acodec'):
f['acodec'] = 'mp4a'
if not f.get('vcodec'):
f['vcodec'] = 'avc1'
man_url = f['url']
if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only
f['vcodec'] = 'none'
elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only
f['acodec'] = 'none'
else: # video+audio
if f['acodec'] == 'none':
f['acodec'] = 'mp4a'
if f['vcodec'] == 'none':
f['vcodec'] = 'avc1'
return fmts
def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
def fix_cdata(s):
# remove \r\n\t before and after <![CDATA[ ]]> to avoid
@ -69,8 +92,7 @@ def fix_cdata(s):
'format_id': 'https-mp3',
})
elif ext == 'm3u8' or 'format=m3u8' in media_url:
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
formats.extend(self._fix_m3u8_formats(media_url, video_id))
elif ext == 'f4m':
# very likely no longer needed. Cannot find any url that uses it.
manifest_url = update_url_query(
@ -153,10 +175,10 @@ def get_format_info(tbr):
'format_id': f'https-{tbr}',
'width': format_copy.get('width'),
'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'),
'vcodec': format_copy.get('vcodec'),
'acodec': format_copy.get('acodec'),
'fps': format_copy.get('fps'),
'tbr': format_copy.get('tbr') or tbr,
'vcodec': format_copy.get('vcodec') or 'avc1',
'acodec': format_copy.get('acodec') or 'mp4a',
'fps': format_copy.get('fps') or 25,
} if format_copy else {
'format_id': f'https-{tbr}',
'width': _QUALITY[tbr][0],
@ -245,7 +267,7 @@ class RaiPlayIE(RaiBaseIE):
'series': 'Report',
'season': '2013/14',
'subtitles': {'it': 'count:4'},
'release_year': 2022,
'release_year': 2024,
'episode': 'Espresso nel caffè - 07/04/2014',
'timestamp': 1396919880,
'upload_date': '20140408',
@ -253,7 +275,7 @@ class RaiPlayIE(RaiBaseIE):
},
'params': {'skip_download': True},
}, {
# 1080p direct mp4 url
# 1080p
'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
'md5': 'aeda7243115380b2dd5e881fd42d949a',
'info_dict': {
@ -274,7 +296,7 @@ class RaiPlayIE(RaiBaseIE):
'episode': 'Senza occhi',
'timestamp': 1637318940,
'upload_date': '20211119',
'formats': 'count:12',
'formats': 'count:7',
},
'params': {'skip_download': True},
'expected_warnings': ['Video not available. Likely due to geo-restriction.']
@ -527,7 +549,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor):
'info_dict': {
'id': 'ilruggitodelconiglio',
'title': 'Il Ruggito del Coniglio',
'description': 'md5:48cff6972435964284614d70474132e6',
'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e',
},
'playlist_mincount': 65,
}, {
@ -634,19 +656,20 @@ def _real_extract(self, url):
}
class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
class RaiNewsIE(RaiBaseIE):
_VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
_EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
_TESTS = [{
# new rainews player (#3911)
'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html',
'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html',
'info_dict': {
'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf',
'id': '31e8017c-845c-43f5-9c48-245b43c3a079',
'ext': 'mp4',
'title': 'Puntata del 29/05/2022',
'duration': 1589,
'upload_date': '20220529',
'title': 'md5:1e81364b09de4a149042bac3c7d36f0b',
'duration': 196,
'upload_date': '20240225',
'uploader': 'rainews',
'formats': 'count:2',
},
'params': {'skip_download': True},
}, {
@ -659,7 +682,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE
'description': 'I film in uscita questa settimana.',
'thumbnail': r're:^https?://.*\.png$',
'duration': 833,
'upload_date': '20161103'
'upload_date': '20161103',
'formats': 'count:8',
},
'params': {'skip_download': True},
'expected_warnings': ['unable to extract player_data'],
@ -684,7 +708,7 @@ def _real_extract(self, url):
if not relinker_url:
# fallback on old implementation for some old content
try:
return self._extract_from_content_id(video_id, url)
return RaiIE._real_extract(self, url)
except GeoRestrictedError:
raise
except ExtractorError as e:

View file

@ -247,17 +247,17 @@ class MujRozhlasIE(RozhlasBaseIE):
'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli',
'md5': '6f8fd68663e64936623e67c152a669e0',
'info_dict': {
'id': '10739193',
'id': '10787730',
'ext': 'mp3',
'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli',
'description': 'md5:db7141e9caaedc9041ec7cefb9a62908',
'timestamp': 1684915200,
'modified_timestamp': 1684922446,
'modified_timestamp': 1687550432,
'series': 'Vykopávky',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg',
'channel_id': 'radio-wave',
'upload_date': '20230524',
'modified_date': '20230524',
'modified_date': '20230623',
},
}, {
# serial extraction
@ -277,6 +277,26 @@ class MujRozhlasIE(RozhlasBaseIE):
'title': 'Nespavci',
'description': 'md5:c430adcbf9e2b9eac88b745881e814dc',
},
}, {
# serialPart
'url': 'https://www.mujrozhlas.cz/povidka/gustavo-adolfo-becquer-hora-duchu',
'info_dict': {
'id': '8889035',
'ext': 'm4a',
'title': 'Gustavo Adolfo Bécquer: Hora duchů',
'description': 'md5:343a15257b376c276e210b78e900ffea',
'chapter': 'Hora duchů a Polibek dva tajemné příběhy Gustava Adolfa Bécquera',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/2adfe1387fb140634be725c1ccf26214.jpg',
'timestamp': 1708173000,
'episode': 'Episode 1',
'episode_number': 1,
'series': 'Povídka',
'modified_date': '20240217',
'upload_date': '20240217',
'modified_timestamp': 1708173198,
'channel_id': 'vltava',
},
'params': {'skip_download': 'dash'},
}]
def _call_api(self, path, item_id, msg='API JSON'):
@ -322,7 +342,7 @@ def _real_extract(self, url):
entity = info['siteEntityBundle']
if entity == 'episode':
if entity in ('episode', 'serialPart'):
return self._extract_audio_entry(self._call_api(
'episodes', info['contentId'], 'episode info API JSON'))

View file

@ -1,5 +1,5 @@
from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj
from ..utils import ExtractorError, int_or_none, traverse_obj
class SwearnetEpisodeIE(InfoExtractor):
@ -51,7 +51,13 @@ def _real_extract(self, url):
display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num')
webpage = self._download_webpage(url, display_id)
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
try:
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
except ExtractorError:
if 'Upgrade Now' in webpage:
self.raise_login_required()
raise
json_data = self._download_json(
f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0]

View file

@ -6,7 +6,7 @@
import time
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
from ..compat import compat_urllib_parse_urlparse
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
@ -15,7 +15,6 @@
UserNotLive,
determine_ext,
format_field,
get_first,
int_or_none,
join_nonempty,
merge_dicts,
@ -51,7 +50,13 @@ def _create_url(user_id, video_id):
def _get_sigi_state(self, webpage, display_id):
return self._search_json(
r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
'sigi state', display_id, end_pattern=r'</script>')
'sigi state', display_id, end_pattern=r'</script>', default={})
def _get_universal_data(self, webpage, display_id):
return traverse_obj(self._search_json(
r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
'universal data', display_id, end_pattern=r'</script>', default={}),
('__DEFAULT_SCOPE__', {dict})) or {}
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
@ -219,8 +224,8 @@ def audio_meta(url):
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res:
known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height'))
known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width'))
known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
parsed_meta.update(known_resolutions.get(res, {}))
add_meta.setdefault('height', int_or_none(res[:-1]))
return [{
@ -237,22 +242,26 @@ def extract_addr(addr, add_meta={}):
# Hack: Add direct video links first to prioritize them when removing duplicate formats
formats = []
width = int_or_none(video_info.get('width'))
height = int_or_none(video_info.get('height'))
if video_info.get('play_addr'):
formats.extend(extract_addr(video_info['play_addr'], {
'format_id': 'play_addr',
'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj(
video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': width,
'height': height,
}))
if video_info.get('download_addr'):
formats.extend(extract_addr(video_info['download_addr'], {
download_addr = video_info['download_addr']
dl_width = int_or_none(download_addr.get('width'))
formats.extend(extract_addr(download_addr, {
'format_id': 'download_addr',
'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
'vcodec': 'h264',
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': dl_width or width,
'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong
'preference': -2 if video_info.get('has_watermark') else -1,
}))
if video_info.get('play_addr_h264'):
@ -315,9 +324,6 @@ def extract_addr(addr, add_meta={}):
return {
'id': aweme_id,
'extractor_key': TikTokIE.ie_key(),
'extractor': TikTokIE.IE_NAME,
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
**traverse_obj(aweme_detail, {
'title': ('desc', {str}),
'description': ('desc', {str}),
@ -609,11 +615,12 @@ class TikTokIE(TikTokBaseIE):
'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
'creator': 'MoxyPatch',
'creators': ['MoxyPatch'],
'uploader': 'moxypatch',
'uploader_id': '7039142049363379205',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
'artist': 'your worst nightmare',
'artists': ['your worst nightmare'],
'track': 'original sound',
'upload_date': '20230303',
'timestamp': 1677866781,
@ -651,7 +658,7 @@ class TikTokIE(TikTokBaseIE):
'comment_count': int,
'thumbnail': r're:^https://.+\.webp',
},
'params': {'format': 'bytevc1_1080p_808907-0'},
'skip': 'Unavailable via feed API, no formats available via web',
}, {
# Slideshow, audio-only m4a format
'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
@ -688,24 +695,35 @@ def _real_extract(self, url):
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
e.expected = True
self.report_warning(f'{e}; trying with webpage')
url = self._create_url(user_id, video_id)
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
if next_data:
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
else:
sigi_data = self._get_sigi_state(webpage, video_id)
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
if status == 0:
if universal_data := self._get_universal_data(webpage, video_id):
self.write_debug('Found universal data for rehydration')
status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0
video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict}))
elif sigi_data := self._get_sigi_state(webpage, video_id):
self.write_debug('Found sigi state data')
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
self.write_debug('Found next.js data')
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
else:
raise ExtractorError('Unable to extract webpage video data')
if video_data and status == 0:
return self._parse_aweme_video_web(video_data, url, video_id)
elif status == 10216:
raise ExtractorError('This video is private', expected=True)
raise ExtractorError('Video not available', video_id=video_id)
raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
class TikTokUserIE(TikTokBaseIE):
@ -921,20 +939,23 @@ class DouyinIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.douyin.com/video/6961737553342991651',
'md5': 'a97db7e3e67eb57bf40735c022ffa228',
'md5': '9ecce7bc5b302601018ecb2871c63a75',
'info_dict': {
'id': '6961737553342991651',
'ext': 'mp4',
'title': '#杨超越 小小水手带你去远航❤️',
'description': '#杨超越 小小水手带你去远航❤️',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 19782,
'creators': ['杨超越'],
'duration': 19,
'timestamp': 1620905839,
'upload_date': '20210513',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -943,20 +964,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6982497745948921092',
'md5': '34a87ebff3833357733da3fe17e37c0e',
'md5': '15c5e660b7048af3707304e3cc02bbb5',
'info_dict': {
'id': '6982497745948921092',
'ext': 'mp4',
'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader': '0731chaoyue',
'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室',
'duration': 42479,
'creators': ['杨超越工作室'],
'duration': 42,
'timestamp': 1625739481,
'upload_date': '20210708',
'track': '@杨超越工作室创作的原声',
'artists': ['杨超越工作室'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -965,20 +989,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6953975910773099811',
'md5': 'dde3302460f19db59c47060ff013b902',
'md5': '0e6443758b8355db9a3c34864a4276be',
'info_dict': {
'id': '6953975910773099811',
'ext': 'mp4',
'title': '#一起看海 出现在你的夏日里',
'description': '#一起看海 出现在你的夏日里',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 17343,
'creators': ['杨超越'],
'duration': 17,
'timestamp': 1619098692,
'upload_date': '20210422',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -1004,20 +1031,23 @@ class DouyinIE(TikTokBaseIE):
'skip': 'No longer available',
}, {
'url': 'https://www.douyin.com/video/6963263655114722595',
'md5': 'cf9f11f0ec45d131445ec2f06766e122',
'md5': '1440bcf59d8700f8e014da073a4dfea8',
'info_dict': {
'id': '6963263655114722595',
'ext': 'mp4',
'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 15115,
'creators': ['杨超越'],
'duration': 15,
'timestamp': 1621261163,
'upload_date': '20210517',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -1025,34 +1055,23 @@ class DouyinIE(TikTokBaseIE):
'thumbnail': r're:https?://.+\.jpe?g',
},
}]
_APP_VERSIONS = [('23.3.0', '230300')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
_UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
_WEBPAGE_HOST = 'https://www.douyin.com/'
def _real_extract(self, url):
video_id = self._match_id(url)
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
e.expected = True
self.to_screen(f'{e}; trying with webpage')
webpage = self._download_webpage(url, video_id)
render_data = self._search_json(
r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id,
contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote)
if not render_data:
detail = traverse_obj(self._download_json(
'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
'Downloading web detail JSON', 'Failed to download web detail JSON',
query={'aweme_id': video_id}, fatal=False), ('aweme_detail', {dict}))
if not detail:
# TODO: Run verification challenge code to generate signature cookies
cookies = self._get_cookies(self._WEBPAGE_HOST)
expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid')
raise ExtractorError(
'Fresh cookies (not necessarily logged in) are needed', expected=expected)
'Fresh cookies (not necessarily logged in) are needed',
expected=not self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id'))
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id)
return self._parse_aweme_video_app(detail)
class TikTokVMIE(InfoExtractor):
@ -1181,7 +1200,7 @@ def _real_extract(self, url):
url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)
if webpage:
data = try_call(lambda: self._get_sigi_state(webpage, uploader or room_id))
data = self._get_sigi_state(webpage, uploader or room_id)
room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
or room_id)

View file

@ -21,6 +21,7 @@
parse_qs,
smuggle_url,
str_or_none,
traverse_obj,
try_get,
unified_timestamp,
unsmuggle_url,
@ -48,17 +49,15 @@ def _unsmuggle_headers(self, url):
return url, data, headers
def _perform_login(self, username, password):
webpage = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
token, vuid = self._extract_xsrft_and_vuid(webpage)
viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token')
data = {
'action': 'login',
'email': username,
'password': password,
'service': 'vimeo',
'token': token,
'token': viewer['xsrft'],
}
self._set_vimeo_cookie('vuid', vuid)
self._set_vimeo_cookie('vuid', viewer['vuid'])
try:
self._download_webpage(
self._LOGIN_URL, None, 'Logging in',
@ -123,7 +122,13 @@ def _parse_config(self, config, video_id):
video_data = config['video']
video_title = video_data.get('title')
live_event = video_data.get('live_event') or {}
is_live = live_event.get('status') == 'started'
live_status = {
'pending': 'is_upcoming',
'active': 'is_upcoming',
'started': 'is_live',
'ended': 'post_live',
}.get(live_event.get('status'))
is_live = live_status == 'is_live'
request = config.get('request') or {}
formats = []
@ -232,7 +237,8 @@ def _parse_config(self, config, video_id):
'chapters': chapters or None,
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
'live_status': live_status,
'release_timestamp': traverse_obj(live_event, ('ingest', 'scheduled_start_time', {parse_iso8601})),
# Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
# at the same time without actual units specified.
'_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'),

View file

@ -114,9 +114,9 @@
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID',
'clientVersion': '17.31.35',
'clientVersion': '18.11.34',
'androidSdkVersion': 30,
'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip'
'userAgent': 'com.google.android.youtube/18.11.34 (Linux; U; Android 11) gzip'
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
@ -127,9 +127,9 @@
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'ANDROID_EMBEDDED_PLAYER',
'clientVersion': '17.31.35',
'clientVersion': '18.11.34',
'androidSdkVersion': 30,
'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip'
'userAgent': 'com.google.android.youtube/18.11.34 (Linux; U; Android 11) gzip'
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 55,
@ -168,9 +168,9 @@
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS',
'clientVersion': '17.33.2',
'clientVersion': '18.11.34',
'deviceModel': 'iPhone14,3',
'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
'userAgent': 'com.google.ios.youtube/18.11.34 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
@ -180,9 +180,9 @@
'INNERTUBE_CONTEXT': {
'client': {
'clientName': 'IOS_MESSAGES_EXTENSION',
'clientVersion': '17.33.2',
'clientVersion': '18.11.34',
'deviceModel': 'iPhone14,3',
'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
'userAgent': 'com.google.ios.youtube/18.11.34 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)'
},
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 66,
@ -3640,15 +3640,28 @@ def _get_requested_clients(self, url, smuggled_data):
return orderedSet(requested_clients)
def _invalid_player_response(self, pr, video_id):
# YouTube may return a different video player response than expected.
# See: https://github.com/TeamNewPipe/NewPipe/issues/8713
if (pr_id := traverse_obj(pr, ('videoDetails', 'videoId'))) != video_id:
return pr_id
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
initial_pr = None
if webpage:
initial_pr = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
prs = []
if initial_pr and not self._invalid_player_response(initial_pr, video_id):
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
# stripped out even if not requested by the user
# See: https://github.com/yt-dlp/yt-dlp/issues/501
prs.append({**initial_pr, 'streamingData': None})
all_clients = set(clients)
clients = clients[::-1]
prs = []
def append_client(*client_names):
""" Append the first client name that exists but not already used """
@ -3660,18 +3673,9 @@ def append_client(*client_names):
all_clients.add(actual_client)
return
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
# stripped out even if not requested by the user
# See: https://github.com/yt-dlp/yt-dlp/issues/501
if initial_pr:
pr = dict(initial_pr)
pr['streamingData'] = None
prs.append(pr)
last_error = None
tried_iframe_fallback = False
player_url = None
skipped_clients = {}
while clients:
client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
@ -3692,26 +3696,19 @@ def append_client(*client_names):
pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data)
except ExtractorError as e:
if last_error:
self.report_warning(last_error)
last_error = e
self.report_warning(e)
continue
if pr:
# YouTube may return a different video player response than expected.
# See: https://github.com/TeamNewPipe/NewPipe/issues/8713
pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId'))
if pr_video_id and pr_video_id != video_id:
self.report_warning(
f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message())
else:
# Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name
prs.append(pr)
if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id
elif pr:
# Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
@ -3722,10 +3719,15 @@ def append_client(*client_names):
elif not variant:
append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')
if last_error:
if not len(prs):
raise last_error
self.report_warning(last_error)
if skipped_clients:
self.report_warning(
f'Skipping player responses from {"/".join(skipped_clients)} clients '
f'(got player responses for video "{"/".join(set(skipped_clients.values()))}" instead of "{video_id}")')
if not prs:
raise ExtractorError(
'All player responses are invalid. Your IP is likely being blocked by Youtube', expected=True)
elif not prs:
raise ExtractorError('Failed to extract any player response')
return prs, player_url
def _needs_live_processing(self, live_status, duration):

118
yt_dlp/extractor/zenporn.py Normal file
View file

@ -0,0 +1,118 @@
import base64
import binascii
from .common import InfoExtractor
from ..utils import ExtractorError, determine_ext, unified_strdate, url_or_none
from ..utils.traversal import traverse_obj
class ZenPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?zenporn\.com/video/(?P<id>\d+)'
_TESTS = [{
'url': 'https://zenporn.com/video/15627016/desi-bhabi-ki-chudai',
'md5': '07bd576b5920714d74975c054ca28dee',
'info_dict': {
'id': '9563799',
'display_id': '15627016',
'ext': 'mp4',
'title': 'md5:669eafd3bbc688aa29770553b738ada2',
'description': '',
'thumbnail': 'md5:2fc044a19bab450fef8f1931e7920a18',
'upload_date': '20230925',
'uploader': 'md5:9fae59847f1f58d1da8f2772016c12f3',
'age_limit': 18,
}
}, {
'url': 'https://zenporn.com/video/15570701',
'md5': 'acba0d080d692664fcc8c4e5502b1a67',
'info_dict': {
'id': '2297875',
'display_id': '15570701',
'ext': 'mp4',
'title': 'md5:47aebdf87644ec91e8b1a844bc832451',
'description': '',
'thumbnail': 'https://mstn.nv7s.com/contents/videos_screenshots/2297000/2297875/480x270/1.jpg',
'upload_date': '20230921',
'uploader': 'Lois Clarke',
'age_limit': 18,
}
}, {
'url': 'https://zenporn.com/video/8531117/amateur-students-having-a-fuck-fest-at-club/',
'md5': '67411256aa9451449e4d29f3be525541',
'info_dict': {
'id': '12791908',
'display_id': '8531117',
'ext': 'mp4',
'title': 'Amateur students having a fuck fest at club',
'description': '',
'thumbnail': 'https://tn.txxx.tube/contents/videos_screenshots/12791000/12791908/288x162/1.jpg',
'upload_date': '20191005',
'uploader': 'Jackopenass',
'age_limit': 18,
}
}, {
'url': 'https://zenporn.com/video/15872038/glad-you-came/',
'md5': '296ccab437f5bac6099433768449d8e1',
'info_dict': {
'id': '111585',
'display_id': '15872038',
'ext': 'mp4',
'title': 'Glad You Came',
'description': '',
'thumbnail': 'https://vpim.m3pd.com/contents/videos_screenshots/111000/111585/480x270/1.jpg',
'upload_date': '20231024',
'uploader': 'Martin Rudenko',
'age_limit': 18,
}
}]
def _gen_info_url(self, ext_domain, extr_id, lifetime=86400):
""" This function is a reverse engineering from the website javascript """
result = '/'.join(str(int(extr_id) // i * i) for i in (1_000_000, 1_000, 1))
return f'https://{ext_domain}/api/json/video/{lifetime}/{result}.json'
@staticmethod
def _decode_video_url(encoded_url):
""" This function is a reverse engineering from the website javascript """
# Replace lookalike characters and standardize map
translation = str.maketrans('АВСЕМ.,~', 'ABCEM+/=')
try:
return base64.b64decode(encoded_url.translate(translation), validate=True).decode()
except (binascii.Error, ValueError):
return None
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
ext_domain, video_id = self._search_regex(
r'https://(?P<ext_domain>[\w.-]+\.\w{3})/embed/(?P<extr_id>\d+)/',
webpage, 'embed info', group=('ext_domain', 'extr_id'))
info_json = self._download_json(
self._gen_info_url(ext_domain, video_id), video_id, fatal=False)
video_json = self._download_json(
f'https://{ext_domain}/api/videofile.php', video_id, query={
'video_id': video_id,
'lifetime': 8640000,
}, note='Downloading video file JSON', errnote='Failed to download video file JSON')
decoded_url = self._decode_video_url(video_json[0]['video_url'])
if not decoded_url:
raise ExtractorError('Unable to decode the video url')
return {
'id': video_id,
'display_id': display_id,
'ext': traverse_obj(video_json, (0, 'format', {determine_ext})),
'url': f'https://{ext_domain}{decoded_url}',
'age_limit': 18,
**traverse_obj(info_json, ('video', {
'title': ('title', {str}),
'description': ('description', {str}),
'thumbnail': ('thumb', {url_or_none}),
'upload_date': ('post_date', {unified_strdate}),
'uploader': ('user', 'username', {str}),
})),
}

View file

@ -68,6 +68,7 @@ def __init__(self, logger, verbose=False):
def close(self):
for handler in self.handlers.values():
handler.close()
self.handlers = {}
def add_handler(self, handler: RequestHandler):
"""Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""