[extractor/common] Added ability to force CODECS for malformed m3u8 manifests

- fixes #887
An audio+video (A+V) stream was being recognized as video-only (V)

- fixes #2918 and https://github.com/ytdl-org/youtube-dl/issues/27830
A video-only (V) stream was being recognized as audio+video (A+V)

- added tests in test_InfoExtractor using the two manifests from the issues
This commit is contained in:
nixxo 2022-12-27 21:10:45 +01:00
parent d61ef7f343
commit a7b899f872
No known key found for this signature in database
GPG key ID: E0DE62EF9A9BFAB2
4 changed files with 209 additions and 9 deletions

View file

@ -864,6 +864,7 @@ def test_parse_m3u8_formats(self):
'height': 1080,
'vcodec': 'avc1.64002a',
}],
{},
{}
),
(
@ -1031,14 +1032,182 @@ def test_parse_m3u8_formats(self):
'ext': 'vtt',
'protocol': 'm3u8_native'
}],
}
},
{}
),
(
'rai-383bca47-d8a1-49c0-876c-f20a2d56d4c1-playlist',
'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
[{
'format_id': 'aac-Audiodescrizione',
'format_note': 'Audiodescrizione',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b192400_ao_slAudiodescrizione_t64QXVkaW9kZXNjcml6aW9uZV9hdWRpbw==.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'language': 'Audiodescrizione',
'ext': 'mp4',
'protocol': 'm3u8_native',
'vcodec': 'none',
'audio_ext': 'mp4',
'video_ext': 'none',
}, {
'format_id': 'aac-Italiano',
'format_note': 'Italiano',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b192400_ao_slItaliano_t64SXRhbGlhbm9fYXVkaW8=.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'language': 'Italiano',
'ext': 'mp4',
'protocol': 'm3u8_native',
'vcodec': 'none',
'audio_ext': 'mp4',
'video_ext': 'none',
}, {
'format_id': '1800',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b1758000_vo_slita_t64MTgwMA==.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'tbr': 1758.0,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1024,
'height': 576,
'vcodec': 'avc1',
'acodec': 'none',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 1758.0,
}, {
'format_id': '2400',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b2344000_vo_slita_t64MjQwMA==.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'tbr': 2344.0,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1280,
'height': 720,
'vcodec': 'avc1',
'acodec': 'none',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 2344.0,
}, {
'format_id': '3600',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b3516000_vo_slita_t64MzYwMA==.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'tbr': 3516.0,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1440,
'height': 810,
'vcodec': 'avc1',
'acodec': 'none',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 3516.0,
}, {
'format_id': '5000',
'url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/chunklist_b5210000_vo_slita_t64NTAwMA==.m3u8',
'manifest_url': 'https://b70cb04c54ab478189e9d8ee45637b13.msvdn.net/ostr8/podcastcdn/teche_root/YT_ITALIA_TECHE_HD_multiaudio/13834457_,1200,1800,2400,3600,5000/playlist.m3u8?auth=daEaKdPbObBcsaIbabWdqd2adavcddNdsaK-bJQVtj-c0-GmsvrFu&aifp=V001',
'tbr': 5210.0,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1920,
'height': 1080,
'vcodec': 'avc1',
'acodec': 'none',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 5210.0,
}],
{},
{'acodec': 'mp4a', 'vcodec': 'avc1'},
),
(
'rai-livestream-48cc9aec-d6f0-4e53-843e-23565b24cd82-playlist',
'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
[{
'format_id': 'aac-Audiodescrizione',
'format_note': 'Audiodescrizione',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/desrai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'language': 'des',
'ext': 'mp4',
'protocol': 'm3u8_native',
'vcodec': 'none',
'audio_ext': 'mp4',
'video_ext': 'none',
}, {
'format_id': 'aac-Italiano',
'format_note': 'Italiano',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/itarai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'language': 'ita',
'ext': 'mp4',
'protocol': 'm3u8_native',
'vcodec': 'none',
'audio_ext': 'mp4',
'video_ext': 'none',
}, {
'format_id': 'aac-Lingua Originale',
'format_note': 'Lingua Originale',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/engrai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'language': 'V.O',
'ext': 'mp4',
'protocol': 'm3u8_native',
'vcodec': 'none',
'audio_ext': 'mp4',
'video_ext': 'none',
}, {
'format_id': '1365',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/rai1_1200/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'tbr': 1365.331,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 768,
'height': 432,
'vcodec': 'avc1.77.31',
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 1365.331,
}, {
'format_id': '2137',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/rai1_1800/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'tbr': 2137.033,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1024,
'height': 576,
'vcodec': 'avc1.77.31',
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 2137.033,
}, {
'format_id': '2793',
'url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/rai1_2400/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56',
'manifest_url': 'https://streamcdng18-8e7439fdb1694c8da3a0fd63e4dda518.msvdn.net/raiuno1/hls/playlist_ma.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672352713&tof=86400&tk2=ecc6060eb5e4fc27a07f695c90fccfc9cb1ced2bac8f5d9b7442e884912b63c3',
'tbr': 2793.078,
'ext': 'mp4',
'protocol': 'm3u8_native',
'width': 1280,
'height': 720,
'vcodec': 'avc1.77.41',
'acodec': 'mp4a.40.2',
'video_ext': 'mp4',
'audio_ext': 'none',
'vbr': 2793.078,
}],
{},
{'acodec': 'mp4a', 'vcodec': 'avc1'},
),
]
for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
for m3u8_file, m3u8_url, expected_formats, expected_subs, codecs in _TEST_CASES:
with open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, encoding='utf-8') as f:
formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
f.read(), m3u8_url, ext='mp4')
f.read(), m3u8_url, ext='mp4', force_codecs=codecs)
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subs, expected_subs, None)

View file

@ -0,0 +1,12 @@
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-STREAM-INF:BANDWIDTH=1758000,NAME="1800",RESOLUTION=1024x576,AUDIO="aac"
chunklist_b1758000_vo_slita_t64MTgwMA==.m3u8
#EXT-X-STREAM-INF:BANDWIDTH=2344000,NAME="2400",RESOLUTION=1280x720,AUDIO="aac"
chunklist_b2344000_vo_slita_t64MjQwMA==.m3u8
#EXT-X-STREAM-INF:BANDWIDTH=3516000,NAME="3600",RESOLUTION=1440x810,AUDIO="aac"
chunklist_b3516000_vo_slita_t64MzYwMA==.m3u8
#EXT-X-STREAM-INF:BANDWIDTH=5210000,NAME="5000",RESOLUTION=1920x1080,AUDIO="aac"
chunklist_b5210000_vo_slita_t64NTAwMA==.m3u8
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="Italiano",NAME="Italiano",DEFAULT=YES,AUTOSELECT=YES,URI="chunklist_b192400_ao_slItaliano_t64SXRhbGlhbm9fYXVkaW8=.m3u8"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="Audiodescrizione",NAME="Audiodescrizione",DEFAULT=NO,AUTOSELECT=YES,URI="chunklist_b192400_ao_slAudiodescrizione_t64QXVkaW9kZXNjcml6aW9uZV9hdWRpbw==.m3u8"

View file

@ -0,0 +1,14 @@
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="ita",NAME="Italiano",DEFAULT=YES,AUTOSELECT=YES,URI="itarai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="V.O",NAME="Lingua Originale",DEFAULT=NO,AUTOSELECT=YES,URI="engrai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="des",NAME="Audiodescrizione",DEFAULT=NO,AUTOSELECT=YES,URI="desrai1_160/chunklist_ao.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2793078,CODECS="avc1.77.41,mp4a.40.2",RESOLUTION=1280x720,AUDIO="aac"
rai1_2400/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2793078,CODECS="avc1.77.42",URI="rai1_2400/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2137033,CODECS="avc1.77.31,mp4a.40.2",RESOLUTION=1024x576,AUDIO="aac"
rai1_1800/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2137033,CODECS="avc1.77.42",URI="rai1_1800/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1365331,CODECS="avc1.77.31,mp4a.40.2",RESOLUTION=768x432,AUDIO="aac"
rai1_1200/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56
#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1365331,CODECS="avc1.77.31,mp4a.40.2",URI="rai1_1200/chunklist.m3u8?baseuri=%2Fraiuno1%2Fhls%2F&tstart=0&tend=1672438656&tk2=2b4cb4fd233734fe415b44d8bdb422941028f6314c0aa7088d8fed35c72edf56"

View file

@ -1909,7 +1909,7 @@ def _extract_m3u8_formats_and_subtitles(
self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
preference=None, quality=None, m3u8_id=None, note=None,
errnote=None, fatal=True, live=False, data=None, headers={},
query={}):
query={}, force_codecs={}):
if self.get_param('ignore_no_formats_error'):
fatal = False
@ -1938,13 +1938,13 @@ def _extract_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, quality=quality, m3u8_id=m3u8_id,
note=note, errnote=errnote, fatal=fatal, live=live, data=data,
headers=headers, query=query, video_id=video_id)
headers=headers, query=query, video_id=video_id, force_codecs=force_codecs)
def _parse_m3u8_formats_and_subtitles(
self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
video_id=None, force_codecs={}):
formats, subtitles = [], {}
has_drm = re.search('|'.join([
@ -2125,7 +2125,6 @@ def build_stream_name():
'abr': abr,
})
codecs = parse_codecs(last_stream_inf.get('CODECS'))
f.update(codecs)
audio_group_id = last_stream_inf.get('AUDIO')
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
@ -2136,12 +2135,18 @@ def build_stream_name():
# (with audio and video) format. So, for such cases we will
# ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f.get('vcodec') != 'none':
def xor(x, y):
    """Return True when exactly one of *x* and *y* is truthy.

    Both arguments are evaluated for truthiness only, so empty
    containers and None count as false.
    """
    # Comparing the two truth values is the idiomatic logical XOR and
    # matches `bool((x and not y) or (not x and y))` for every input.
    return bool(x) != bool(y)
if audio_group_id and xor(codecs, force_codecs) and f.get('vcodec') != 'none' and force_codecs.get('vcodec') != 'none':
audio_group = groups.get(audio_group_id)
if audio_group and audio_group[0].get('URI'):
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f['acodec'] = 'none'
codecs = codecs or force_codecs
codecs['acodec'] = 'none'
f.update(codecs)
if not f.get('ext'):
f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
formats.append(f)