[ie/francetv] Fix m3u8 formats extraction (#9347)

Authored by: bashonly
This commit is contained in:
bashonly 2024-03-03 17:19:52 -06:00 committed by GitHub
parent 40966e8da2
commit ede624d1db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -1,17 +1,16 @@
import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
from ..networking import HEADRequest from ..networking import HEADRequest
from ..utils import ( from ..utils import (
ExtractorError,
determine_ext, determine_ext,
filter_dict, filter_dict,
format_field, format_field,
int_or_none, int_or_none,
join_nonempty, join_nonempty,
parse_iso8601, parse_iso8601,
parse_qs,
smuggle_url, smuggle_url,
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
@@ -20,53 +19,31 @@
class FranceTVBaseInfoExtractor(InfoExtractor): class FranceTVBaseInfoExtractor(InfoExtractor):
def _make_url_result(self, video_or_full_id, catalog=None, url=None): def _make_url_result(self, video_id, url=None):
full_id = 'francetv:%s' % video_or_full_id video_id = video_id.split('@')[0] # for compat with old @catalog IDs
if '@' not in video_or_full_id and catalog: full_id = f'francetv:{video_id}'
full_id += '@%s' % catalog
if url: if url:
full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname}) full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname})
return self.url_result( return self.url_result(full_id, FranceTVIE, video_id)
full_id, ie=FranceTVIE.ie_key(),
video_id=video_or_full_id.split('@')[0])
class FranceTVIE(InfoExtractor): class FranceTVIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'francetv:(?P<id>[^@#]+)'
(?:
https?://
sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\?
.*?\bidDiffusion=[^&]+|
(?:
https?://videos\.francetv\.fr/video/|
francetv:
)
(?P<id>[^@]+)(?:@(?P<catalog>.+))?
)
'''
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1']
_GEO_COUNTRIES = ['FR'] _GEO_COUNTRIES = ['FR']
_GEO_BYPASS = False _GEO_BYPASS = False
_TESTS = [{ _TESTS = [{
# without catalog 'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0',
'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f',
'info_dict': { 'info_dict': {
'id': '162311093', 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'ext': 'mp4', 'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus', 'title': '13h15, le dimanche... - Les mystères de Jésus',
'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
'timestamp': 1502623500, 'timestamp': 1502623500,
'duration': 2580,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170813', 'upload_date': '20170813',
}, },
}, { 'params': {'skip_download': 'm3u8'},
# with catalog
'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4',
'only_matching': True,
}, {
'url': 'http://videos.francetv.fr/video/NI_657393@Regions',
'only_matching': True,
}, { }, {
'url': 'francetv:162311093', 'url': 'francetv:162311093',
'only_matching': True, 'only_matching': True,
@@ -88,8 +65,7 @@ class FranceTVIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _extract_video(self, video_id, catalogue=None, hostname=None): def _extract_video(self, video_id, hostname=None):
# TODO: Investigate/remove 'catalogue'/'catalog'; it has not been used since 2021
is_live = None is_live = None
videos = [] videos = []
title = None title = None
@@ -101,12 +77,13 @@ def _extract_video(self, video_id, catalogue=None, hostname=None):
timestamp = None timestamp = None
spritesheets = None spritesheets = None
for device_type in ('desktop', 'mobile'): # desktop+chrome returns dash; mobile+safari returns hls
for device_type, browser in [('desktop', 'chrome'), ('mobile', 'safari')]:
dinfo = self._download_json( dinfo = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, f'https://k7.ftven.fr/videos/{video_id}', video_id,
video_id, f'Downloading {device_type} video JSON', query=filter_dict({ f'Downloading {device_type} {browser} video JSON', query=filter_dict({
'device_type': device_type, 'device_type': device_type,
'browser': 'chrome', 'browser': browser,
'domain': hostname, 'domain': hostname,
}), fatal=False) }), fatal=False)
@@ -156,23 +133,28 @@ def _extract_video(self, video_id, catalogue=None, hostname=None):
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'f4m': if ext == 'f4m':
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id=format_id, fatal=False)) video_url, video_id, f4m_id=format_id or ext, fatal=False))
elif ext == 'm3u8': elif ext == 'm3u8':
format_id = format_id or 'hls'
fmts, subs = self._extract_m3u8_formats_and_subtitles( fmts, subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
entry_protocol='m3u8_native', m3u8_id=format_id, for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None):
fatal=False) if mobj := re.match(rf'{format_id}-[Aa]udio-\w+-(?P<bitrate>\d+)', f['format_id']):
f.update({
'tbr': int_or_none(mobj.group('bitrate')),
'acodec': 'mp4a',
})
formats.extend(fmts) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles) self._merge_subtitles(subs, target=subtitles)
elif ext == 'mpd': elif ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles( fmts, subs = self._extract_mpd_formats_and_subtitles(
video_url, video_id, mpd_id=format_id, fatal=False) video_url, video_id, mpd_id=format_id or 'dash', fatal=False)
formats.extend(fmts) formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles) self._merge_subtitles(subs, target=subtitles)
elif video_url.startswith('rtmp'): elif video_url.startswith('rtmp'):
formats.append({ formats.append({
'url': video_url, 'url': video_url,
'format_id': 'rtmp-%s' % format_id, 'format_id': join_nonempty('rtmp', format_id),
'ext': 'flv', 'ext': 'flv',
}) })
else: else:
@@ -211,7 +193,7 @@ def _extract_video(self, video_id, catalogue=None, hostname=None):
# a 10×10 grid of thumbnails corresponding to approximately # a 10×10 grid of thumbnails corresponding to approximately
# 2 seconds of the video; the last spritesheet may be shorter # 2 seconds of the video; the last spritesheet may be shorter
'duration': 200, 'duration': 200,
} for sheet in spritesheets] } for sheet in traverse_obj(spritesheets, (..., {url_or_none}))]
}) })
return { return {
@@ -227,22 +209,15 @@ def _extract_video(self, video_id, catalogue=None, hostname=None):
'series': title if episode_number else None, 'series': title if episode_number else None,
'episode_number': int_or_none(episode_number), 'episode_number': int_or_none(episode_number),
'season_number': int_or_none(season_number), 'season_number': int_or_none(season_number),
'_format_sort_fields': ('res', 'tbr', 'proto'), # prioritize m3u8 over dash
} }
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
mobj = self._match_valid_url(url) video_id = self._match_id(url)
video_id = mobj.group('id') hostname = smuggled_data.get('hostname') or 'www.france.tv'
catalog = mobj.group('catalog')
if not video_id: return self._extract_video(video_id, hostname=hostname)
qs = parse_qs(url)
video_id = qs.get('idDiffusion', [None])[0]
catalog = qs.get('catalogue', [None])[0]
if not video_id:
raise ExtractorError('Invalid URL', expected=True)
return self._extract_video(video_id, catalog, hostname=smuggled_data.get('hostname'))
class FranceTVSiteIE(FranceTVBaseInfoExtractor): class FranceTVSiteIE(FranceTVBaseInfoExtractor):
@@ -264,6 +239,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
}, },
'add_ie': [FranceTVIE.ie_key()], 'add_ie': [FranceTVIE.ie_key()],
}, { }, {
# geo-restricted
'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
'info_dict': { 'info_dict': {
'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44', 'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44',
@@ -322,17 +298,16 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
catalogue = None
video_id = self._search_regex( video_id = self._search_regex(
r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1', r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'video id', default=None, group='id') webpage, 'video id', default=None, group='id')
if not video_id: if not video_id:
video_id, catalogue = self._html_search_regex( video_id = self._html_search_regex(
r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@"]+@[^"]+)"',
webpage, 'video ID').split('@') webpage, 'video ID')
return self._make_url_result(video_id, catalogue, url=url) return self._make_url_result(video_id, url=url)
class FranceTVInfoIE(FranceTVBaseInfoExtractor): class FranceTVInfoIE(FranceTVBaseInfoExtractor):
@@ -346,8 +321,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Soir 3', 'title': 'Soir 3',
'upload_date': '20190822', 'upload_date': '20190822',
'timestamp': 1566510900, 'timestamp': 1566510730,
'description': 'md5:72d167097237701d6e8452ff03b83c00', 'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 1637,
'subtitles': { 'subtitles': {
'fr': 'mincount:2', 'fr': 'mincount:2',
}, },
@@ -362,8 +338,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'info_dict': { 'info_dict': {
'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482', 'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Covid-19 : une situation catastrophique à New Dehli', 'title': 'Covid-19 : une situation catastrophique à New Dehli - Édition du mercredi 21 avril 2021',
'thumbnail': str, 'thumbnail': r're:^https?://.*\.jpe?g$',
'duration': 76, 'duration': 76,
'timestamp': 1619028518, 'timestamp': 1619028518,
'upload_date': '20210421', 'upload_date': '20210421',
@@ -389,11 +365,17 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
'id': 'x4iiko0', 'id': 'x4iiko0',
'ext': 'mp4', 'ext': 'mp4',
'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', 'description': 'md5:fdcb582c370756293a65cdfbc6ecd90e',
'timestamp': 1467011958, 'timestamp': 1467011958,
'upload_date': '20160627',
'uploader': 'France Inter', 'uploader': 'France Inter',
'uploader_id': 'x2q2ez', 'uploader_id': 'x2q2ez',
'upload_date': '20160627',
'view_count': int,
'tags': ['Politique', 'France Inter', '27 juin 2016', 'Linvité de 8h20', 'Cécile Duflot', 'Patrick Cohen'],
'age_limit': 0,
'duration': 640,
'like_count': int,
'thumbnail': r're:https://[^/?#]+/v/[^/?#]+/x1080',
}, },
'add_ie': ['Dailymotion'], 'add_ie': ['Dailymotion'],
}, { }, {