Subtitle extraction from streaming media manifests #247

Authored by fstirlitz
Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144

Closes: #73
Fixes:
https://github.com/ytdl-org/youtube-dl/issues/6106
https://github.com/ytdl-org/youtube-dl/issues/14977
https://github.com/ytdl-org/youtube-dl/issues/21438
https://github.com/ytdl-org/youtube-dl/issues/23609
https://github.com/ytdl-org/youtube-dl/issues/28132

Might also fix (untested):
https://github.com/ytdl-org/youtube-dl/issues/15424
https://github.com/ytdl-org/youtube-dl/issues/18267
https://github.com/ytdl-org/youtube-dl/issues/23899
https://github.com/ytdl-org/youtube-dl/issues/24375
https://github.com/ytdl-org/youtube-dl/issues/24595
https://github.com/ytdl-org/youtube-dl/issues/27899

Related:
https://github.com/ytdl-org/youtube-dl/issues/22379
https://github.com/ytdl-org/youtube-dl/pull/24517
https://github.com/ytdl-org/youtube-dl/pull/24886
https://github.com/ytdl-org/youtube-dl/pull/27215

Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
    * Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
        Once the unnecessary headers are stripped from the dumped output, it becomes a valid, self-contained TTML file.
    * The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
    * Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
    * The WebVTT subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
    * But the validity of those extracted from ISM is untested
This commit is contained in:
pukkandan 2021-04-28 19:02:43 +05:30 committed by GitHub
commit be6202f12b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
25 changed files with 2730 additions and 267 deletions

View file

@ -684,17 +684,186 @@ def test_parse_m3u8_formats(self):
'width': 1920,
'height': 1080,
'vcodec': 'avc1.64002a',
}]
}],
{}
),
(
'bipbop_16x9',
'https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8',
[{
"format_id": "bipbop_audio-BipBop Audio 2",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/alternate_audio_aac/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"language": "eng",
"ext": "mp4",
"protocol": "m3u8",
"preference": None,
"quality": None,
"vcodec": "none",
"audio_ext": "mp4",
"video_ext": "none",
}, {
"format_id": "41",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear0/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 41.457,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"vcodec": "none",
"acodec": "mp4a.40.2",
"audio_ext": "mp4",
"video_ext": "none",
"abr": 41.457,
}, {
"format_id": "263",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear1/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 263.851,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"width": 416,
"height": 234,
"vcodec": "avc1.4d400d",
"acodec": "mp4a.40.2",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 263.851,
"abr": 0,
}, {
"format_id": "577",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear2/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 577.61,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"width": 640,
"height": 360,
"vcodec": "avc1.4d401e",
"acodec": "mp4a.40.2",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 577.61,
"abr": 0,
}, {
"format_id": "915",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear3/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 915.905,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"width": 960,
"height": 540,
"vcodec": "avc1.4d401f",
"acodec": "mp4a.40.2",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 915.905,
"abr": 0,
}, {
"format_id": "1030",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear4/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 1030.138,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"width": 1280,
"height": 720,
"vcodec": "avc1.4d401f",
"acodec": "mp4a.40.2",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 1030.138,
"abr": 0,
}, {
"format_id": "1924",
"format_index": None,
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/gear5/prog_index.m3u8",
"manifest_url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/bipbop_16x9_variant.m3u8",
"tbr": 1924.009,
"ext": "mp4",
"fps": None,
"protocol": "m3u8",
"preference": None,
"quality": None,
"width": 1920,
"height": 1080,
"vcodec": "avc1.4d401f",
"acodec": "mp4a.40.2",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 1924.009,
"abr": 0,
}],
{
"en": [{
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}, {
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/eng_forced/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}],
"fr": [{
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}, {
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/fra_forced/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}],
"es": [{
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}, {
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/spa_forced/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}],
"ja": [{
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}, {
"url": "https://devstreaming-cdn.apple.com/videos/streaming/examples/bipbop_16x9/subtitles/jpn_forced/prog_index.m3u8",
"ext": "vtt",
"protocol": "m3u8_native"
}],
}
),
]
for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_m3u8_formats(
formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
f.read(), m3u8_url, ext='mp4')
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subs, expected_subs, None)
def test_parse_mpd_formats(self):
_TEST_CASES = [
@ -780,7 +949,8 @@ def test_parse_mpd_formats(self):
'tbr': 5997.485,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/pull/14844
'urls_only',
@ -863,7 +1033,8 @@ def test_parse_mpd_formats(self):
'tbr': 4400,
'width': 1920,
'height': 1080,
}]
}],
{},
), (
# https://github.com/ytdl-org/youtube-dl/issues/20346
# Media considered unfragmented even though it contains
@ -909,18 +1080,328 @@ def test_parse_mpd_formats(self):
'width': 360,
'height': 360,
'fps': 30,
}]
}],
{},
), (
'subtitles',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/',
[{
"format_id": "audio=128001",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "m4a",
"tbr": 128.001,
"asr": 48000,
"format_note": "DASH audio",
"container": "m4a_dash",
"vcodec": "none",
"acodec": "mp4a.40.2",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"audio_ext": "m4a",
"video_ext": "none",
"abr": 128.001,
}, {
"format_id": "video=100000",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "mp4",
"width": 336,
"height": 144,
"tbr": 100,
"format_note": "DASH video",
"container": "mp4_dash",
"vcodec": "avc1.4D401F",
"acodec": "none",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 100,
}, {
"format_id": "video=326000",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "mp4",
"width": 562,
"height": 240,
"tbr": 326,
"format_note": "DASH video",
"container": "mp4_dash",
"vcodec": "avc1.4D401F",
"acodec": "none",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 326,
}, {
"format_id": "video=698000",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "mp4",
"width": 844,
"height": 360,
"tbr": 698,
"format_note": "DASH video",
"container": "mp4_dash",
"vcodec": "avc1.4D401F",
"acodec": "none",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 698,
}, {
"format_id": "video=1493000",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "mp4",
"width": 1126,
"height": 480,
"tbr": 1493,
"format_note": "DASH video",
"container": "mp4_dash",
"vcodec": "avc1.4D401F",
"acodec": "none",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 1493,
}, {
"format_id": "video=4482000",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"ext": "mp4",
"width": 1688,
"height": 720,
"tbr": 4482,
"format_note": "DASH video",
"container": "mp4_dash",
"vcodec": "avc1.4D401F",
"acodec": "none",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
"video_ext": "mp4",
"audio_ext": "none",
"vbr": 4482,
}],
{
"en": [
{
"ext": "mp4",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd",
"fragment_base_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/",
"protocol": "http_dash_segments",
}
]
},
)
]
for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES:
for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_mpd_formats(
formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')),
mpd_base_url=mpd_base_url, mpd_url=mpd_url)
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subtitles, expected_subtitles, None)
def test_parse_ism_formats(self):
# Checks that `_parse_ism_formats_and_subtitles` returns the expected formats
# and subtitles for each Smooth Streaming (ISM) manifest fixture.
# Each case is a tuple of:
# (fixture name, manifest URL, expected formats, expected subtitles).
_TEST_CASES = [
(
'sintel',
'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
[{
"format_id": "audio-128",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "isma",
"tbr": 128,
"asr": 48000,
"vcodec": "none",
"acodec": "AACL",
"protocol": "ism",
"_download_params": {
"stream_type": "audio",
"duration": 8880746666,
"timescale": 10000000,
"width": 0,
"height": 0,
"fourcc": "AACL",
"codec_private_data": "1190",
"sampling_rate": 48000,
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"audio_ext": "isma",
"video_ext": "none",
"abr": 128,
}, {
"format_id": "video-100",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "ismv",
"width": 336,
"height": 144,
"tbr": 100,
"vcodec": "AVC1",
"acodec": "none",
"protocol": "ism",
"_download_params": {
"stream_type": "video",
"duration": 8880746666,
"timescale": 10000000,
"width": 336,
"height": 144,
"fourcc": "AVC1",
"codec_private_data": "00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8",
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"video_ext": "ismv",
"audio_ext": "none",
"vbr": 100,
}, {
"format_id": "video-326",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "ismv",
"width": 562,
"height": 240,
"tbr": 326,
"vcodec": "AVC1",
"acodec": "none",
"protocol": "ism",
"_download_params": {
"stream_type": "video",
"duration": 8880746666,
"timescale": 10000000,
"width": 562,
"height": 240,
"fourcc": "AVC1",
"codec_private_data": "00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8",
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"video_ext": "ismv",
"audio_ext": "none",
"vbr": 326,
}, {
"format_id": "video-698",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "ismv",
"width": 844,
"height": 360,
"tbr": 698,
"vcodec": "AVC1",
"acodec": "none",
"protocol": "ism",
"_download_params": {
"stream_type": "video",
"duration": 8880746666,
"timescale": 10000000,
"width": 844,
"height": 360,
"fourcc": "AVC1",
"codec_private_data": "00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8",
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"video_ext": "ismv",
"audio_ext": "none",
"vbr": 698,
}, {
"format_id": "video-1493",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "ismv",
"width": 1126,
"height": 480,
"tbr": 1493,
"vcodec": "AVC1",
"acodec": "none",
"protocol": "ism",
"_download_params": {
"stream_type": "video",
"duration": 8880746666,
"timescale": 10000000,
"width": 1126,
"height": 480,
"fourcc": "AVC1",
"codec_private_data": "00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8",
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"video_ext": "ismv",
"audio_ext": "none",
"vbr": 1493,
}, {
"format_id": "video-4482",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"ext": "ismv",
"width": 1688,
"height": 720,
"tbr": 4482,
"vcodec": "AVC1",
"acodec": "none",
"protocol": "ism",
"_download_params": {
"stream_type": "video",
"duration": 8880746666,
"timescale": 10000000,
"width": 1688,
"height": 720,
"fourcc": "AVC1",
"codec_private_data": "00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8",
"channels": 2,
"bits_per_sample": 16,
"nal_unit_length_field": 4
},
"video_ext": "ismv",
"audio_ext": "none",
"vbr": 4482,
}],
# Expected subtitles: a single TTML text-stream entry under the "eng" key.
{
"eng": [
{
"ext": "ismt",
"protocol": "ism",
"url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"manifest_url": "https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest",
"_download_params": {
"stream_type": "text",
"duration": 8880746666,
"timescale": 10000000,
"fourcc": "TTML",
"codec_private_data": ""
}
}
]
},
),
]
# For every case: read ./test/testdata/ism/<name>.Manifest, parse it as XML,
# and run the parser with the case's manifest URL.
for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES:
with io.open('./test/testdata/ism/%s.Manifest' % ism_file,
mode='r', encoding='utf-8') as f:
formats, subtitles = self.ie._parse_ism_formats_and_subtitles(
compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url)
# Formats are sorted before being compared with the expected list.
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subtitles, expected_subtitles, None)
def test_parse_f4m_formats(self):
_TEST_CASES = [

988
test/testdata/ism/sintel.Manifest vendored Normal file
View file

@ -0,0 +1,988 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Unified Streaming Platform (version=1.10.18-20255) -->
<SmoothStreamingMedia
MajorVersion="2"
MinorVersion="0"
TimeScale="10000000"
Duration="8880746666">
<StreamIndex
Type="audio"
QualityLevels="1"
TimeScale="10000000"
Name="audio"
Chunks="445"
Url="QualityLevels({bitrate})/Fragments(audio={start time})">
<QualityLevel
Index="0"
Bitrate="128001"
CodecPrivateData="1190"
SamplingRate="48000"
Channels="2"
BitsPerSample="16"
PacketSize="4"
AudioTag="255"
FourCC="AACL" />
<c t="0" d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="20053333" />
<c d="20053333" />
<c d="20053334" />
<c d="19840000" />
<c d="746666" />
</StreamIndex>
<StreamIndex
Type="text"
QualityLevels="1"
TimeScale="10000000"
Language="eng"
Subtype="CAPT"
Name="textstream_eng"
Chunks="11"
Url="QualityLevels({bitrate})/Fragments(textstream_eng={start time})">
<QualityLevel
Index="0"
Bitrate="1000"
CodecPrivateData=""
FourCC="TTML" />
<c t="0" d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="600000000" />
<c d="240000000" />
</StreamIndex>
<StreamIndex
Type="video"
QualityLevels="5"
TimeScale="10000000"
Name="video"
Chunks="444"
Url="QualityLevels({bitrate})/Fragments(video={start time})"
MaxWidth="1688"
MaxHeight="720"
DisplayWidth="1689"
DisplayHeight="720">
<QualityLevel
Index="0"
Bitrate="100000"
CodecPrivateData="00000001674D401FDA0544EFFC2D002CBC40000003004000000C03C60CA80000000168EF32C8"
MaxWidth="336"
MaxHeight="144"
FourCC="AVC1" />
<QualityLevel
Index="1"
Bitrate="326000"
CodecPrivateData="00000001674D401FDA0241FE23FFC3BC83BA44000003000400000300C03C60CA800000000168EF32C8"
MaxWidth="562"
MaxHeight="240"
FourCC="AVC1" />
<QualityLevel
Index="2"
Bitrate="698000"
CodecPrivateData="00000001674D401FDA0350BFB97FF06AF06AD1000003000100000300300F1832A00000000168EF32C8"
MaxWidth="844"
MaxHeight="360"
FourCC="AVC1" />
<QualityLevel
Index="3"
Bitrate="1493000"
CodecPrivateData="00000001674D401FDA011C3DE6FFF0D890D871000003000100000300300F1832A00000000168EF32C8"
MaxWidth="1126"
MaxHeight="480"
FourCC="AVC1" />
<QualityLevel
Index="4"
Bitrate="4482000"
CodecPrivateData="00000001674D401FDA01A816F97FFC1ABC1AB440000003004000000C03C60CA80000000168EF32C8"
MaxWidth="1688"
MaxHeight="720"
FourCC="AVC1" />
<c t="0" d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
<c d="20000000" />
</StreamIndex>
</SmoothStreamingMedia>

38
test/testdata/m3u8/bipbop_16x9.m3u8 vendored Normal file
View file

@ -0,0 +1,38 @@
#EXTM3U
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 1",AUTOSELECT=YES,DEFAULT=YES
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="bipbop_audio",LANGUAGE="eng",NAME="BipBop Audio 2",AUTOSELECT=NO,DEFAULT=NO,URI="alternate_audio_aac/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English",DEFAULT=YES,AUTOSELECT=YES,FORCED=NO,LANGUAGE="en",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/eng/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="English (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="en",URI="subtitles/eng_forced/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="fr",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/fra/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Français (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="fr",URI="subtitles/fra_forced/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="es",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/spa/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Español (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="es",URI="subtitles/spa_forced/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語",DEFAULT=NO,AUTOSELECT=YES,FORCED=NO,LANGUAGE="ja",CHARACTERISTICS="public.accessibility.transcribes-spoken-dialog, public.accessibility.describes-music-and-sound",URI="subtitles/jpn/prog_index.m3u8"
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="日本語 (Forced)",DEFAULT=NO,AUTOSELECT=NO,FORCED=YES,LANGUAGE="ja",URI="subtitles/jpn_forced/prog_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=263851,CODECS="mp4a.40.2, avc1.4d400d",RESOLUTION=416x234,AUDIO="bipbop_audio",SUBTITLES="subs"
gear1/prog_index.m3u8
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=28451,CODECS="avc1.4d400d",URI="gear1/iframe_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=577610,CODECS="mp4a.40.2, avc1.4d401e",RESOLUTION=640x360,AUDIO="bipbop_audio",SUBTITLES="subs"
gear2/prog_index.m3u8
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=181534,CODECS="avc1.4d401e",URI="gear2/iframe_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=915905,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=960x540,AUDIO="bipbop_audio",SUBTITLES="subs"
gear3/prog_index.m3u8
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=297056,CODECS="avc1.4d401f",URI="gear3/iframe_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=1030138,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1280x720,AUDIO="bipbop_audio",SUBTITLES="subs"
gear4/prog_index.m3u8
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=339492,CODECS="avc1.4d401f",URI="gear4/iframe_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=1924009,CODECS="mp4a.40.2, avc1.4d401f",RESOLUTION=1920x1080,AUDIO="bipbop_audio",SUBTITLES="subs"
gear5/prog_index.m3u8
#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=669554,CODECS="avc1.4d401f",URI="gear5/iframe_index.m3u8"
#EXT-X-STREAM-INF:BANDWIDTH=41457,CODECS="mp4a.40.2",AUDIO="bipbop_audio",SUBTITLES="subs"
gear0/prog_index.m3u8

351
test/testdata/mpd/subtitles.mpd vendored Normal file
View file

@ -0,0 +1,351 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Created with Unified Streaming Platform (version=1.10.18-20255) -->
<MPD
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="urn:mpeg:dash:schema:mpd:2011"
xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
type="static"
mediaPresentationDuration="PT14M48S"
maxSegmentDuration="PT1M"
minBufferTime="PT10S"
profiles="urn:mpeg:dash:profile:isoff-live:2011">
<Period
id="1"
duration="PT14M48S">
<BaseURL>dash/</BaseURL>
<AdaptationSet
id="1"
group="1"
contentType="audio"
segmentAlignment="true"
audioSamplingRate="48000"
mimeType="audio/mp4"
codecs="mp4a.40.2"
startWithSAP="1">
<AudioChannelConfiguration
schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011"
value="2" />
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="48000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="96256" r="2" />
<S d="95232" />
<S d="3584" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="audio=128001"
bandwidth="128001">
</Representation>
</AdaptationSet>
<AdaptationSet
id="2"
group="3"
contentType="text"
lang="en"
mimeType="application/mp4"
codecs="stpp"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="subtitle" />
<SegmentTemplate
timescale="1000"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="60000" r="9" />
<S d="24000" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="textstream_eng=1000"
bandwidth="1000">
</Representation>
</AdaptationSet>
<AdaptationSet
id="3"
group="2"
contentType="video"
par="960:409"
minBandwidth="100000"
maxBandwidth="4482000"
maxWidth="1689"
maxHeight="720"
segmentAlignment="true"
mimeType="video/mp4"
codecs="avc1.4D401F"
startWithSAP="1">
<Role schemeIdUri="urn:mpeg:dash:role:2011" value="main" />
<SegmentTemplate
timescale="12288"
initialization="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$.dash"
media="3144-kZT4LWMQw6Rh7Kpd-$RepresentationID$-$Time$.dash">
<SegmentTimeline>
<S t="0" d="24576" r="443" />
</SegmentTimeline>
</SegmentTemplate>
<Representation
id="video=100000"
bandwidth="100000"
width="336"
height="144"
sar="2880:2863"
scanType="progressive">
</Representation>
<Representation
id="video=326000"
bandwidth="326000"
width="562"
height="240"
sar="115200:114929"
scanType="progressive">
</Representation>
<Representation
id="video=698000"
bandwidth="698000"
width="844"
height="360"
sar="86400:86299"
scanType="progressive">
</Representation>
<Representation
id="video=1493000"
bandwidth="1493000"
width="1126"
height="480"
sar="230400:230267"
scanType="progressive">
</Representation>
<Representation
id="video=4482000"
bandwidth="4482000"
width="1688"
height="720"
sar="86400:86299"
scanType="progressive">
</Representation>
</AdaptationSet>
</Period>
</MPD>

View file

@ -3018,10 +3018,24 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
return ctypes.WINFUNCTYPE(*args, **kwargs)
# Types of compiled regular expressions and of match objects.
# Use the public names exported by `re` when they exist; when the running
# interpreter's `re` module lacks the attribute, take the type from a live
# object instead.
compat_Pattern = getattr(re, 'Pattern', None)
if compat_Pattern is None:
    compat_Pattern = type(re.compile(''))

compat_Match = getattr(re, 'Match', None)
if compat_Match is None:
    compat_Match = type(re.compile('').match(''))
__all__ = [
'compat_HTMLParseError',
'compat_HTMLParser',
'compat_HTTPError',
'compat_Match',
'compat_Pattern',
'compat_Struct',
'compat_b64decode',
'compat_basestring',

View file

@ -77,7 +77,10 @@ def _read_ytdl_file(self, ctx):
assert 'ytdl_corrupt' not in ctx
stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
try:
ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
ytdl_data = json.loads(stream.read())
ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
if 'extra_state' in ytdl_data['downloader']:
ctx['extra_state'] = ytdl_data['downloader']['extra_state']
except Exception:
ctx['ytdl_corrupt'] = True
finally:
@ -90,6 +93,8 @@ def _write_ytdl_file(self, ctx):
'index': ctx['fragment_index'],
},
}
if 'extra_state' in ctx:
downloader['extra_state'] = ctx['extra_state']
if ctx.get('fragment_count') is not None:
downloader['fragment_count'] = ctx['fragment_count']
frag_index_stream.write(json.dumps({'downloader': downloader}))

View file

@ -2,6 +2,7 @@
import errno
import re
import io
import binascii
try:
from Crypto.Cipher import AES
@ -27,7 +28,9 @@
parse_m3u8_attributes,
sanitize_open,
update_url_query,
bug_reports_message,
)
from .. import webvtt
class HlsFD(FragmentFD):
@ -78,6 +81,8 @@ def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
is_webvtt = info_dict['ext'] == 'vtt'
urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
man_url = urlh.geturl()
s = urlh.read().decode('utf-8', 'ignore')
@ -142,6 +147,8 @@ def is_ad_fragment_end(s):
else:
self._prepare_and_start_frag_download(ctx)
extra_state = ctx.setdefault('extra_state', {})
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
test = self.params.get('test', False)
@ -308,6 +315,76 @@ def download_fragment(fragment):
return frag_content, frag_index
pack_fragment = lambda frag_content, _: frag_content
if is_webvtt:
def pack_fragment(frag_content, frag_index):
# Re-serialise one downloaded WebVTT fragment.
#
# Every block yielded by webvtt.parse_fragment(frag_content) is either
# dropped or written into an in-memory text buffer; the buffer is returned
# encoded as UTF-8 bytes. State that has to survive from one fragment to
# the next is kept in the enclosing `extra_state` dict.
#
# frag_content: the raw fragment, passed straight to webvtt.parse_fragment.
# frag_index:   fragment number; the value 1 is treated as the first fragment.
#
# NOTE(review): leading indentation is not visible in this view, so the
# nesting described in the comments below is inferred from the statement
# order — confirm against the real file.
output = io.StringIO()
# Offset added to the start/end of every cue in this fragment; it is
# recomputed from the fragment's Magic block further down.
adjust = 0
for block in webvtt.parse_fragment(frag_content):
if isinstance(block, webvtt.CueBlock):
block.start += adjust
block.end += adjust
# Cues already emitted that may still be repeated by a later fragment;
# the list lives in extra_state so it persists between calls.
dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
cue = block.as_json
# skip the cue if an identical one appears
# in the window of potential duplicates
# and prune the window of unviable candidates
i = 0
skip = True
while i < len(dedup_window):
window_cue = dedup_window[i]
if window_cue == cue:
# identical cue already written: leave the loop with skip still True
break
if window_cue['end'] >= cue['start']:
# this window entry has not ended before the new cue starts: keep it
i += 1
continue
# entry ended before the new cue starts: remove it from the window
# (i is not advanced because the following entries shift down)
del dedup_window[i]
else:
# presumably the `else` of the `while` above, i.e. reached only when the
# loop finished without `break` (no duplicate found) — TODO confirm
skip = False
if skip:
continue
# add the cue to the window
dedup_window.append(cue)
elif isinstance(block, webvtt.Magic):
# take care of MPEG PES timestamp overflow
# A missing mpegts value is treated as 0. The number of wrap-arounds seen
# so far (webvtt_mpegts_adjust) is added back in units of 1 << 33, and a
# value lower than the last one recorded counts as one more wrap-around.
if block.mpegts is None:
block.mpegts = 0
extra_state.setdefault('webvtt_mpegts_adjust', 0)
block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
extra_state['webvtt_mpegts_adjust'] += 1
block.mpegts += 1 << 33
extra_state['webvtt_mpegts_last'] = block.mpegts
if frag_index == 1:
# First fragment: remember its mpegts/local pair as the reference that
# later fragments are measured against.
extra_state['webvtt_mpegts'] = block.mpegts or 0
extra_state['webvtt_local'] = block.local or 0
# XXX: block.local = block.mpegts = None ?
else:
# Later fragments: derive the cue offset from how far this fragment's
# mpegts/local pair has moved relative to the stored reference.
if block.mpegts is not None and block.local is not None:
adjust = (
(block.mpegts - extra_state.get('webvtt_mpegts', 0))
- (block.local - extra_state.get('webvtt_local', 0))
)
# presumably inside the `else` branch, so that the Magic block of a
# non-first fragment is not written to the output — TODO confirm
continue
elif isinstance(block, webvtt.HeaderBlock):
if frag_index != 1:
# XXX: this should probably be silent as well
# or verify that all segments contain the same data
self.report_warning(bug_reports_message(
'Discarding a %s block found in the middle of the stream; '
'if the subtitles display incorrectly,'
% (type(block).__name__)))
# presumably guarded by `frag_index != 1`: the header block of a later
# fragment is dropped after the warning — TODO confirm
continue
# Every block that was not skipped above is serialised into the buffer.
block.write_into(output)
return output.getvalue().encode('utf-8')
def append_fragment(frag_content, frag_index):
if frag_content:
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
@ -315,6 +392,7 @@ def append_fragment(frag_content, frag_index):
file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
file.close()
frag_content = pack_fragment(frag_content, frag_index)
self._append_fragment(ctx, frag_content)
return True
except EnvironmentError as ose:

View file

@ -48,7 +48,7 @@ def write_piff_header(stream, params):
language = params.get('language', 'und')
height = params.get('height', 0)
width = params.get('width', 0)
is_audio = width == 0 and height == 0
stream_type = params['stream_type']
creation_time = modification_time = int(time.time())
ftyp_payload = b'isml' # major brand
@ -77,7 +77,7 @@ def write_piff_header(stream, params):
tkhd_payload += u32.pack(0) * 2 # reserved
tkhd_payload += s16.pack(0) # layer
tkhd_payload += s16.pack(0) # alternate group
tkhd_payload += s88.pack(1 if is_audio else 0) # volume
tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume
tkhd_payload += u16.pack(0) # reserved
tkhd_payload += unity_matrix
tkhd_payload += u1616.pack(width)
@ -93,19 +93,34 @@ def write_piff_header(stream, params):
mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box
hdlr_payload = u32.pack(0) # pre defined
hdlr_payload += b'soun' if is_audio else b'vide' # handler type
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name
if stream_type == 'audio': # handler type
hdlr_payload += b'soun'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'SoundHandler\0' # name
elif stream_type == 'video':
hdlr_payload += b'vide'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'VideoHandler\0' # name
elif stream_type == 'text':
hdlr_payload += b'subt'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'SubtitleHandler\0' # name
else:
assert False
mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box
if is_audio:
if stream_type == 'audio':
smhd_payload = s88.pack(0) # balance
smhd_payload += u16.pack(0) # reserved
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header
else:
elif stream_type == 'video':
vmhd_payload = u16.pack(0) # graphics mode
vmhd_payload += u16.pack(0) * 3 # opcolor
media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header
elif stream_type == 'text':
media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header
else:
assert False
minf_payload = media_header_box
dref_payload = u32.pack(1) # entry count
@ -117,7 +132,7 @@ def write_piff_header(stream, params):
sample_entry_payload = u8.pack(0) * 6 # reserved
sample_entry_payload += u16.pack(1) # data reference index
if is_audio:
if stream_type == 'audio':
sample_entry_payload += u32.pack(0) * 2 # reserved
sample_entry_payload += u16.pack(params.get('channels', 2))
sample_entry_payload += u16.pack(params.get('bits_per_sample', 16))
@ -127,7 +142,7 @@ def write_piff_header(stream, params):
if fourcc == 'AACL':
sample_entry_box = box(b'mp4a', sample_entry_payload)
else:
elif stream_type == 'video':
sample_entry_payload += u16.pack(0) # pre defined
sample_entry_payload += u16.pack(0) # reserved
sample_entry_payload += u32.pack(0) * 3 # pre defined
@ -155,6 +170,18 @@ def write_piff_header(stream, params):
avcc_payload += pps
sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record
sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry
else:
assert False
elif stream_type == 'text':
if fourcc == 'TTML':
sample_entry_payload += b'http://www.w3.org/ns/ttml\0' # namespace
sample_entry_payload += b'\0' # schema location
sample_entry_payload += b'\0' # auxilary mime types(??)
sample_entry_box = box(b'stpp', sample_entry_payload)
else:
assert False
else:
assert False
stsd_payload += sample_entry_box
stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box
@ -221,10 +248,13 @@ def real_download(self, filename, info_dict):
self._prepare_and_start_frag_download(ctx)
extra_state = ctx.setdefault('extra_state', {
'ism_track_written': False,
})
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
track_written = False
frag_index = 0
for i, segment in enumerate(segments):
frag_index += 1
@ -236,11 +266,11 @@ def real_download(self, filename, info_dict):
success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
if not track_written:
if not extra_state['ism_track_written']:
tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
track_written = True
extra_state['ism_track_written'] = True
self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:

View file

@ -86,18 +86,19 @@ def _real_extract(self, url):
title = episode['titulo']
formats = []
subtitles = {}
# Collect formats and subtitles from every advertised source.
for source in episode.get('sources', []):
    src = source.get('src')
    if not src:
        continue
    src_type = source.get('type')
    # Use the *_formats_and_subtitles variants: the plain *_formats helpers
    # return only the list of formats, so unpacking their result into
    # (formats, subtitles) would fail.
    if src_type == 'application/vnd.apple.mpegurl':
        fmts, subs = self._extract_m3u8_formats_and_subtitles(
            src, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
    elif src_type == 'application/dash+xml':
        fmts, subs = self._extract_mpd_formats_and_subtitles(
            src, video_id, mpd_id='dash', fatal=False)
    else:
        # Unknown source type: nothing to extract.
        continue
    # Accumulate rather than overwrite, so that results from earlier
    # sources are kept when several sources are listed.
    formats.extend(fmts)
    subtitles = self._merge_subtitles(subtitles, subs)
self._sort_formats(formats)
heartbeat = episode.get('heartbeat') or {}
@ -115,4 +116,5 @@ def _real_extract(self, url):
'channel': get_meta('channel'),
'season': get_meta('season'),
'episode_number': int_or_none(get_meta('episodeNumber')),
'subtitles': subtitles,
}

View file

@ -82,6 +82,7 @@ def _real_extract(self, url):
info = {}
formats = []
subtitles = {}
for format_id, ep in video.items():
if not isinstance(ep, dict):
continue
@ -90,12 +91,16 @@ def _real_extract(self, url):
continue
ext = determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
m3u8_id='hls', fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
video_url, video_id, mpd_id='dash', fatal=False)
formats.extend(mpd_fmts)
subtitles = self._merge_subtitles(subtitles, mpd_subs)
else:
formats.append({
'url': video_url,
@ -114,4 +119,5 @@ def _real_extract(self, url):
'display_id': display_id,
'title': display_id,
'formats': formats,
'subtitles': subtitles,
})

View file

@ -83,24 +83,31 @@ def _real_extract(self, url):
description = data.get('description')
formats = []
subtitles = {}
for target in data['targetUrls']:
format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
if not format_url or not format_type:
continue
format_type = format_type.upper()
if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
formats.extend(self._extract_m3u8_formats(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
m3u8_id=format_type, fatal=False))
m3u8_id=format_type, fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HDS':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_type, fatal=False))
elif format_type == 'MPEG_DASH':
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id=format_type, fatal=False))
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id=format_type, fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif format_type == 'HSS':
formats.extend(self._extract_ism_formats(
format_url, video_id, ism_id='mss', fatal=False))
fmts, subs = self._extract_ism_formats_and_subtitles(
format_url, video_id, ism_id='mss', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
else:
formats.append({
'format_id': format_type,
@ -108,7 +115,6 @@ def _real_extract(self, url):
})
self._sort_formats(formats)
subtitles = {}
subtitle_urls = data.get('subtitleUrls')
if isinstance(subtitle_urls, list):
for subtitle in subtitle_urls:

View file

@ -1879,11 +1879,21 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m
'format_note': 'Quality selection URL',
}
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None, quality=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False, data=None, headers={},
query={}):
def _extract_m3u8_formats(self, *args, **kwargs):
    """Return only the formats found in an HLS manifest.

    Compatibility wrapper: all arguments are forwarded unchanged to
    _extract_m3u8_formats_and_subtitles(). Subtitle tracks found in the
    manifest are not returned; when there are any, a warning is reported
    so that the calling extractor can be updated.
    """
    extracted = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
    formats, dropped_subtitles = extracted
    if not dropped_subtitles:
        return formats
    warning = bug_reports_message(
        "Ignoring subtitle tracks found in the HLS manifest; "
        "if any subtitle tracks are missing,")
    self.report_warning(warning)
    return formats
def _extract_m3u8_formats_and_subtitles(
self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
preference=None, quality=None, m3u8_id=None, note=None,
errnote=None, fatal=True, live=False, data=None, headers={},
query={}):
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
@ -1891,30 +1901,34 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
return [], {}
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
return self._parse_m3u8_formats(
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, quality=quality, m3u8_id=m3u8_id,
note=note, errnote=errnote, fatal=fatal, live=live, data=data,
headers=headers, query=query, video_id=video_id)
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
entry_protocol='m3u8', preference=None, quality=None,
m3u8_id=None, live=False, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}, video_id=None):
def _parse_m3u8_formats_and_subtitles(
self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
return [], {}
if (not self._downloader.params.get('allow_unplayable_formats')
and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
return []
return [], {}
formats = []
subtitles = {}
format_url = lambda u: (
u
if re.match(r'^https?://', u)
@ -2001,7 +2015,7 @@ def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None
}
formats.append(f)
return formats
return formats, subtitles
groups = {}
last_stream_inf = {}
@ -2013,6 +2027,21 @@ def extract_media(x_media_line):
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
# <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
if media_type == 'SUBTITLES':
    # LANGUAGE is an optional EXT-X-MEDIA attribute; fall back to 'und'
    # (undetermined) instead of raising KeyError when it is absent.
    lang = media.get('LANGUAGE') or 'und'  # XXX: normalise?
    sub_uri = media.get('URI')
    # A subtitle rendition without a URI cannot be downloaded. Skip it
    # rather than aborting the whole manifest parse with a KeyError.
    if sub_uri:
        url = format_url(sub_uri)
        sub_info = {
            'url': url,
            'ext': determine_ext(url),
        }
        if sub_info['ext'] == 'm3u8':
            # Per RFC 8216 §3.1, the only possible subtitle format m3u8
            # files may contain is WebVTT:
            # <https://tools.ietf.org/html/rfc8216#section-3.1>
            sub_info['ext'] = 'vtt'
            sub_info['protocol'] = 'm3u8_native'
        subtitles.setdefault(lang, []).append(sub_info)
if media_type not in ('VIDEO', 'AUDIO'):
return
media_url = media.get('URI')
@ -2160,7 +2189,7 @@ def build_stream_name():
formats.append(http_f)
last_stream_inf = {}
return formats
return formats, subtitles
@staticmethod
def _xpath_ns(path, namespace=None):
@ -2403,23 +2432,44 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
})
return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only compatibility wrapper for DASH manifests.

    Delegates to `_extract_mpd_formats_and_subtitles` with the same
    arguments and returns only the formats. Subtitle tracks found along
    the way are discarded after reporting a warning.
    """
    result = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    formats, dropped_subtitles = result
    if dropped_subtitles:
        notice = bug_reports_message(
            "Ignoring subtitle tracks found in the DASH manifest; "
            "if any subtitle tracks are missing,"
        )
        self.report_warning(notice)
    return formats
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
return [], {}
mpd_doc, urlh = res
if mpd_doc is None:
return []
return [], {}
mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats(
return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only compatibility wrapper around the DASH manifest parser.

    Passes its arguments straight to `_parse_mpd_formats_and_subtitles`
    and returns the formats alone. A warning is reported when subtitle
    tracks were parsed too, because they are dropped here.
    """
    formats, dropped_subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if not dropped_subtitles:
        return formats
    self.report_warning(bug_reports_message(
        "Ignoring subtitle tracks found in the DASH manifest; "
        "if any subtitle tracks are missing,"
    ))
    return formats
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@ -2429,7 +2479,7 @@ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None
"""
if not self._downloader.params.get('dynamic_mpd', True):
if mpd_doc.get('type') == 'dynamic':
return []
return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@ -2501,6 +2551,7 @@ def extract_Initialization(source):
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = []
subtitles = {}
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
@ -2518,11 +2569,9 @@ def extract_Initialization(source):
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
elif content_type in ('video', 'audio'):
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
if content_type in ('video', 'audio', 'text'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
@ -2539,21 +2588,28 @@ def extract_Initialization(source):
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(representation_attrib.get('codecs')))
if content_type in ('video', 'audio'):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(representation_attrib.get('codecs')))
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
'manifest_url': mpd_url,
'filesize': filesize,
}
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
@ -2700,26 +2756,38 @@ def add_segment_url():
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
formats.append(f)
if content_type in ('video', 'audio'):
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
return formats, subtitles
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only compatibility wrapper for ISM manifests.

    Calls `_extract_ism_formats_and_subtitles` with the given arguments
    and returns just the formats, reporting a warning when subtitle
    tracks were also found and are being ignored.
    """
    result = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    formats, dropped_subtitles = result
    if dropped_subtitles:
        notice = bug_reports_message(
            "Ignoring subtitle tracks found in the ISM manifest; "
            "if any subtitle tracks are missing,"
        )
        self.report_warning(notice)
    return formats
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
ism_url, video_id,
note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest',
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
return [], {}
ism_doc, urlh = res
if ism_doc is None:
return []
return [], {}
return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
Parse formats from ISM manifest.
References:
@ -2727,26 +2795,28 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
https://msdn.microsoft.com/en-us/library/ff469518.aspx
"""
if ism_doc.get('IsLive') == 'TRUE':
return []
return [], {}
if (not self._downloader.params.get('allow_unplayable_formats')
and ism_doc.find('Protection') is not None):
return []
return [], {}
duration = int(ism_doc.attrib['Duration'])
timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
formats = []
subtitles = {}
for stream in ism_doc.findall('StreamIndex'):
stream_type = stream.get('Type')
if stream_type not in ('video', 'audio'):
if stream_type not in ('video', 'audio', 'text'):
continue
url_pattern = stream.attrib['Url']
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
stream_language = stream.get('Language', 'und')
for track in stream.findall('QualityLevel'):
fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
# TODO: add support for WVC1 and WMAP
if fourcc not in ('H264', 'AVC1', 'AACL'):
if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
@ -2789,33 +2859,52 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
format_id.append(stream_name)
format_id.append(compat_str(tbr))
formats.append({
'format_id': '-'.join(format_id),
'url': ism_url,
'manifest_url': ism_url,
'ext': 'ismv' if stream_type == 'video' else 'isma',
'width': width,
'height': height,
'tbr': tbr,
'asr': sampling_rate,
'vcodec': 'none' if stream_type == 'audio' else fourcc,
'acodec': 'none' if stream_type == 'video' else fourcc,
'protocol': 'ism',
'fragments': fragments,
'_download_params': {
'duration': duration,
'timescale': stream_timescale,
'width': width or 0,
'height': height or 0,
'fourcc': fourcc,
'codec_private_data': track.get('CodecPrivateData'),
'sampling_rate': sampling_rate,
'channels': int_or_none(track.get('Channels', 2)),
'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
},
})
return formats
if stream_type == 'text':
subtitles.setdefault(stream_language, []).append({
'ext': 'ismt',
'protocol': 'ism',
'url': ism_url,
'manifest_url': ism_url,
'fragments': fragments,
'_download_params': {
'stream_type': stream_type,
'duration': duration,
'timescale': stream_timescale,
'fourcc': fourcc,
'language': stream_language,
'codec_private_data': track.get('CodecPrivateData'),
}
})
elif stream_type in ('video', 'audio'):
formats.append({
'format_id': '-'.join(format_id),
'url': ism_url,
'manifest_url': ism_url,
'ext': 'ismv' if stream_type == 'video' else 'isma',
'width': width,
'height': height,
'tbr': tbr,
'asr': sampling_rate,
'vcodec': 'none' if stream_type == 'audio' else fourcc,
'acodec': 'none' if stream_type == 'video' else fourcc,
'protocol': 'ism',
'fragments': fragments,
'_download_params': {
'stream_type': stream_type,
'duration': duration,
'timescale': stream_timescale,
'width': width or 0,
'height': height or 0,
'fourcc': fourcc,
'language': stream_language,
'codec_private_data': track.get('CodecPrivateData'),
'sampling_rate': sampling_rate,
'channels': int_or_none(track.get('Channels', 2)),
'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
},
})
return formats, subtitles
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
def absolute_url(item_url):
@ -2940,7 +3029,16 @@ def _media_formats(src, cur_media_type, type_info={}):
entries.append(media_info)
return entries
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
def _extract_akamai_formats(self, *args, **kwargs):
    """Formats-only compatibility wrapper for Akamai manifests.

    Forwards to `_extract_akamai_formats_and_subtitles` and returns only
    the formats. Any subtitle tracks it found are dropped, and a warning
    is reported in that case.
    """
    formats, dropped_subtitles = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
    if not dropped_subtitles:
        return formats
    notice = bug_reports_message(
        "Ignoring subtitle tracks found in the manifests; "
        "if any subtitle tracks are missing,"
    )
    self.report_warning(notice)
    return formats
def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
signed = 'hdnea=' in manifest_url
if not signed:
# https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
@ -2949,6 +3047,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
'', manifest_url).strip('?')
formats = []
subtitles = {}
hdcore_sign = 'hdcore=3.7.0'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
@ -2967,10 +3066,11 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
hls_host = hosts.get('hls')
if hls_host:
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
m3u8_formats = self._extract_m3u8_formats(
m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
formats.extend(m3u8_formats)
subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
http_host = hosts.get('http')
if http_host and m3u8_formats and not signed:
@ -2994,7 +3094,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats.append(http_f)
i += 1
return formats
return formats, subtitles
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
query = compat_urlparse.urlparse(url).query
@ -3319,12 +3419,22 @@ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
return ret
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
""" Merge two subtitle dictionaries, language by language. """
ret = dict(subtitle_dict1)
for lang in subtitle_dict2:
ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
return ret
def _merge_subtitles(cls, *dicts, **kwargs):
""" Merge subtitle dictionaries, language by language. """
target = (lambda target=None: target)(**kwargs)
# The above lambda extracts the keyword argument 'target' from kwargs
# while ensuring there are no stray ones. When Python 2 support
# is dropped, remove it and change the function signature to:
#
# def _merge_subtitles(cls, *dicts, target=None):
if target is None:
target = {}
for d in dicts:
for lang, subs in d.items():
target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
return target
def extract_automatic_captions(self, *args, **kwargs):
if (self._downloader.params.get('writeautomaticsub', False)

View file

@ -1,9 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import os
import re
import tempfile
from .common import InfoExtractor
from ..utils import (
@ -12,12 +10,12 @@
try_get,
)
from ..compat import compat_str
from ..downloader.hls import HlsFD
class ElonetIE(InfoExtractor):
_VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
_TEST = {
_TESTS = [{
# m3u8 with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
'md5': '8efc954b96c543711707f87de757caea',
'info_dict': {
@ -27,62 +25,17 @@ class ElonetIE(InfoExtractor):
'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
},
}
def _download_m3u8_chunked_subtitle(self, chunklist_url):
"""
Download VTT subtitles from pieces in manifest URL.
Return a string containing joined chunks with extra headers removed.
"""
with tempfile.NamedTemporaryFile(delete=True) as outfile:
fname = outfile.name
hlsdl = HlsFD(self._downloader, {})
hlsdl.download(compat_str(fname), {"url": chunklist_url})
with open(fname, 'r') as fin:
# Remove (some) headers
fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read())
os.remove(fname)
return "WEBVTT\n\n" + fdata
def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
"""
Parse subtitles from HLS / m3u8 manifest.
"""
subtitles = {}
baseurl = m3u8_url[:m3u8_url.rindex('/') + 1]
for line in m3u8_doc.split('\n'):
if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
lang = self._search_regex(
r'LANGUAGE="(.+?)"', line, 'lang', default=False)
uri = self._search_regex(
r'URI="(.+?)"', line, 'uri', default=False)
if lang and uri:
data = self._download_m3u8_chunked_subtitle(baseurl + uri)
subtitles[lang] = [{'ext': 'vtt', 'data': data}]
return subtitles
def _parse_mpd_subtitles(self, mpd_doc):
"""
Parse subtitles from MPD manifest.
"""
ns = '{urn:mpeg:dash:schema:mpd:2011}'
subtitles = {}
for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)):
lang = aset.attrib.get('lang', 'unk')
url = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns)).text
subtitles[lang] = [{'ext': 'vtt', 'url': url}]
return subtitles
def _get_subtitles(self, fmt, doc, url):
if fmt == 'm3u8':
subs = self._parse_m3u8_subtitles(doc, url)
elif fmt == 'mpd':
subs = self._parse_mpd_subtitles(doc)
else:
self.report_warning(
"Cannot download subtitles from '%s' streams." % (fmt))
subs = {}
return subs
}, {
# DASH with subtitles
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
'info_dict': {
'id': '116539',
'ext': 'mp4',
'title': 'Minulla on tiikeri',
'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
@ -101,8 +54,8 @@ def _real_extract(self, url):
self._parse_json(json_s, video_id),
lambda x: x[0]["src"], compat_str)
formats = []
subtitles = {}
if re.search(r'\.m3u8\??', src):
fmt = 'm3u8'
res = self._download_webpage_handle(
# elonet servers have certificate problems
src.replace('https:', 'http:'), video_id,
@ -111,11 +64,10 @@ def _real_extract(self, url):
if res:
doc, urlh = res
url = urlh.geturl()
formats = self._parse_m3u8_formats(doc, url)
formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
for f in formats:
f['ext'] = 'mp4'
elif re.search(r'\.mpd\??', src):
fmt = 'mpd'
res = self._download_xml_handle(
src, video_id,
note='Downloading MPD manifest',
@ -123,7 +75,7 @@ def _real_extract(self, url):
if res:
doc, urlh = res
url = base_url(urlh.geturl())
formats = self._parse_mpd_formats(doc, mpd_base_url=url)
formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
else:
raise ExtractorError("Unknown streaming format")
@ -133,5 +85,5 @@ def _real_extract(self, url):
'description': description,
'thumbnail': thumbnail,
'formats': formats,
'subtitles': self.extract_subtitles(fmt, doc, url),
'subtitles': subtitles,
}

View file

@ -151,6 +151,7 @@ def sign(manifest_url, manifest_id):
videos.append(fallback_info['video'])
formats = []
subtitles = {}
for video in videos:
video_url = video.get('url')
if not video_url:
@ -171,10 +172,12 @@ def sign(manifest_url, manifest_id):
sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
video_id, f4m_id=format_id, fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
@ -199,13 +202,12 @@ def sign(manifest_url, manifest_id):
title += ' - %s' % subtitle
title = title.strip()
subtitles = {}
subtitles_list = [{
'url': subformat['url'],
'ext': subformat.get('format'),
} for subformat in info.get('subtitles', []) if subformat.get('url')]
if subtitles_list:
subtitles['fr'] = subtitles_list
subtitles.setdefault('fr', []).extend(
[{
'url': subformat['url'],
'ext': subformat.get('format'),
} for subformat in info.get('subtitles', []) if subformat.get('url')]
)
return {
'id': video_id,

View file

@ -2444,8 +2444,9 @@ def _real_extract(self, url):
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
format_id = compat_str(m.group('format_id'))
subtitles = {}
if format_id.endswith('mpegurl'):
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id)
else:
@ -2457,6 +2458,7 @@ def _real_extract(self, url):
info_dict['direct'] = True
self._sort_formats(formats)
info_dict['formats'] = formats
info_dict['subtitles'] = subtitles
return info_dict
if not self._downloader.params.get('test', False) and not is_intentional:
@ -2510,7 +2512,7 @@ def _real_extract(self, url):
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif doc.tag == 'SmoothStreamingMedia':
info_dict['formats'] = self._parse_ism_formats(doc, url)
info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@ -2524,7 +2526,7 @@ def _real_extract(self, url):
xspf_base_url=full_response.geturl()),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats(
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)

View file

@ -46,6 +46,7 @@ def get_file_size(file_size):
urls = []
formats = []
subtitles = {}
for video in video_data.get('renditions', []):
video_url = video.get('url')
format_id = video.get('type')
@ -54,9 +55,11 @@ def get_file_size(file_size):
urls.append(video_url)
ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id or 'hls', fatal=False))
m3u8_id=format_id or 'hls', fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
continue
# formats.extend(self._extract_mpd_formats(
@ -96,6 +99,7 @@ def get_file_size(file_size):
'uploader': video_data.get('byline'),
'duration': float_or_none(video_data.get('duration'), 1000),
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
}

View file

@ -103,7 +103,7 @@ def _real_extract(self, url):
api_episode_url + '/videos', display_id,
'Downloading video JSON metadata')['data'][0]
m3u8_url = video_data['attributes']['url']
subtitle_m3u8_url = video_data['links']['download']
# XXX: additional URL at video_data['links']['download']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
@ -111,7 +111,7 @@ def _real_extract(self, url):
'%s is only available for FIRST members' % display_id)
raise
formats = self._extract_m3u8_formats(
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
self._sort_formats(formats)
@ -134,33 +134,6 @@ def _real_extract(self, url):
'url': img_url,
})
subtitles = {}
res = self._download_webpage_handle(
subtitle_m3u8_url, display_id,
'Downloading m3u8 information',
'Failed to download m3u8 information',
fatal=True, data=None, headers={}, query={})
if res is not False:
subtitle_m3u8_doc, _ = res
for line in subtitle_m3u8_doc.split('\n'):
if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
parts = line.split(',')
for part in parts:
if 'LANGUAGE' in part:
lang = part[part.index('=') + 2:-1]
elif 'URI' in part:
uri = part[part.index('=') + 2:-1]
res = self._download_webpage_handle(
uri, display_id,
'Downloading m3u8 information',
'Failed to download m3u8 information',
fatal=True, data=None, headers={}, query={})
doc, _ = res
for l in doc.split('\n'):
if not l.startswith('#'):
subtitles[lang] = [{'url': uri[:-uri[::-1].index('/')] + l}]
break
return {
'id': video_id,
'display_id': display_id,

View file

@ -87,6 +87,7 @@ def _real_extract(self, url):
title = media_data['title']
formats = []
subtitles = {}
q = qualities(['SD', 'HD'])
for source in (media_data.get('resourceList') or []):
format_url = source.get('url')
@ -104,12 +105,16 @@ def _real_extract(self, url):
if source.get('tokenType') == 'AKAMAI':
format_url = self._get_tokenized_src(
format_url, media_id, format_id)
formats.extend(self._extract_akamai_formats(
format_url, media_id))
fmts, subs = self._extract_akamai_formats_and_subtitles(
format_url, media_id)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif protocol == 'HLS':
formats.extend(self._extract_m3u8_formats(
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
format_url, media_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False))
m3u8_id=format_id, fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif protocol in ('HTTP', 'HTTPS'):
formats.append({
'format_id': format_id,
@ -133,7 +138,6 @@ def _real_extract(self, url):
})
self._sort_formats(formats)
subtitles = {}
if media_type == 'video':
for sub in (media_data.get('subtitleList') or []):
sub_url = sub.get('url')

View file

@ -99,16 +99,21 @@ def _real_extract(self, url):
aspect = float_or_none(config.get('aspect'))
formats = []
subtitles = {}
for source_type, source in (config.get('sources') or {}).items():
if not source:
continue
if source_type == 'dash':
formats.extend(self._extract_mpd_formats(
source, video_id, mpd_id='mpd', fatal=False))
fmts, subs = self._extract_mpd_formats_and_subtitles(
source, video_id, mpd_id='mpd', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'hls':
formats.extend(self._extract_m3u8_formats(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
m3u8_id='hls', fatal=False))
m3u8_id='hls', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
elif source_type == 'progressive':
for s in source:
src = s.get('src')
@ -138,7 +143,6 @@ def _real_extract(self, url):
# behaviour is being kept as-is
self._sort_formats(formats, ('res', 'source_preference'))
subtitles = {}
for subtitle in (config.get('subtitles') or []):
src = subtitle.get('src')
if not src:

View file

@ -93,18 +93,31 @@ def _real_extract(self, url):
'device': 'browser',
'protocol': 'hls',
})['playbackItem']['manifestUrl']
formats = self._extract_m3u8_formats(
formats = []
subtitles = {}
fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
formats.extend(self._extract_mpd_formats(
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
fmts, subs = self._extract_mpd_formats_and_subtitles(
manifest_url.replace('.m3u8', '.mpd'),
video_id, mpd_id='dash', fatal=False))
formats.extend(self._extract_f4m_formats(
video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
fmts = self._extract_f4m_formats(
manifest_url.replace('.m3u8', '.f4m'),
video_id, f4m_id='hds', fatal=False))
formats.extend(self._extract_ism_formats(
video_id, f4m_id='hds', fatal=False)
formats.extend(fmts)
fmts, subs = self._extract_ism_formats_and_subtitles(
re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
video_id, ism_id='mss', fatal=False))
video_id, ism_id='mss', fatal=False)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
if not formats and info.get('is_geo_restricted'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
@ -115,7 +128,7 @@ def _real_extract(self, url):
'id': video_id,
'title': title,
'formats': formats,
# 'subtitles': subtitles,
'subtitles': subtitles,
'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')),

View file

@ -36,9 +36,9 @@ class TwitterBaseIE(InfoExtractor):
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
if not variant_url:
return []
return [], {}
elif '.m3u8' in variant_url:
return self._extract_m3u8_formats(
return self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
else:
@ -49,22 +49,27 @@ def _extract_variant_formats(self, variant, video_id):
'tbr': tbr,
}
self._search_dimensions_in_video_url(f, variant_url)
return [f]
return [f], {}
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
    """Collect formats and subtitles from a VMAP document.

    Every `videoVariant` element is passed to `_extract_variant_formats`
    with its URL unquoted first. The text of the `.//MediaFile` element
    is tried as one more variant when no `videoVariant` already listed it.

    Returns a `(formats, subtitles)` pair.
    """
    vmap_data = self._download_xml(vmap_url, video_id)
    formats = []
    subtitles = {}
    urls = []
    for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
        video_variant.attrib['url'] = compat_urllib_parse_unquote(
            video_variant.attrib['url'])
        urls.append(video_variant.attrib['url'])
        # _extract_variant_formats yields a (formats, subtitles) pair, so
        # it has to be unpacked rather than fed to formats.extend() directly
        fmts, subs = self._extract_variant_formats(
            video_variant.attrib, video_id)
        formats.extend(fmts)
        subtitles = self._merge_subtitles(subtitles, subs)
    video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
    if video_url not in urls:
        fmts, subs = self._extract_variant_formats({'url': video_url}, video_id)
        formats.extend(fmts)
        subtitles = self._merge_subtitles(subtitles, subs)
    return formats, subtitles
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
@ -471,8 +476,11 @@ def extract_from_video_info(media):
video_info = media.get('video_info') or {}
formats = []
subtitles = {}
for variant in video_info.get('variants', []):
formats.extend(self._extract_variant_formats(variant, twid))
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
self._sort_formats(formats)
thumbnails = []
@ -491,6 +499,7 @@ def add_thumbnail(name, size):
info.update({
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
@ -540,7 +549,7 @@ def get_binding_value(k):
is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
self._sort_formats(formats)
thumbnails = []
@ -558,6 +567,7 @@ def get_binding_value(k):
info.update({
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': int_or_none(get_binding_value(
'content_duration_seconds')),

View file

@ -30,7 +30,7 @@ class UplynkIE(InfoExtractor):
def _extract_uplynk_info(self, uplynk_content_url):
path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
display_id = video_id or external_id
formats = self._extract_m3u8_formats(
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
'http://content.uplynk.com/%s.m3u8' % path,
display_id, 'mp4', 'm3u8_native')
if session_id:
@ -48,6 +48,7 @@ def _extract_uplynk_info(self, uplynk_content_url):
'duration': float_or_none(asset.get('duration')),
'uploader_id': asset.get('owner'),
'formats': formats,
'subtitles': subtitles,
}
def _real_extract(self, url):

View file

@ -69,19 +69,24 @@ def _real_extract(self, url):
title = video_info['title']
formats = []
subtitles = {}
def extract_formats(manifest_urls):
for f, f_url in manifest_urls.items():
if not f_url:
continue
if f in ('dash', 'mpd'):
formats.extend(self._extract_mpd_formats(
fmts, subs = self._extract_mpd_formats_and_subtitles(
f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
video_id, mpd_id='dash', fatal=False))
video_id, mpd_id='dash', fatal=False)
elif f == 'hls':
formats.extend(self._extract_m3u8_formats(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
f_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False))
'm3u8_native', m3u8_id='hls', fatal=False)
else:
continue
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
delivery = video_data.get('delivery') or {}
extract_formats({delivery.get('format'): delivery.get('url')})
@ -103,4 +108,5 @@ def extract_formats(manifest_urls):
video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),
'duration': int_or_none(video_info.get('duration')),
'formats': formats,
'subtitles': subtitles,
}

View file

@ -2340,15 +2340,20 @@ def make_HTTPS_handler(params, **kwargs):
return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
def bug_reports_message(before=';'):
if ytdl_is_updateable():
update_cmd = 'type yt-dlp -U to update'
else:
update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
msg = '; please report this issue on https://github.com/yt-dlp/yt-dlp .'
msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
msg += ' Make sure you are using the latest version; %s.' % update_cmd
msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
return msg
before = before.rstrip()
if not before or before.endswith(('.', '!', '?')):
msg = msg[0].title() + msg[1:]
return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):

378
yt_dlp/webvtt.py Normal file
View file

@ -0,0 +1,378 @@
# coding: utf-8
from __future__ import unicode_literals, print_function, division
"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.
Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""
import re
import io
from .utils import int_or_none
from .compat import (
compat_str as str,
compat_Pattern,
compat_Match,
)
class _MatchParser(object):
"""
An object that maintains the current parsing position and allows
conveniently advancing it as syntax elements are successfully parsed.
"""
def __init__(self, string):
self._data = string
self._pos = 0
def match(self, r):
if isinstance(r, compat_Pattern):
return r.match(self._data, self._pos)
if isinstance(r, str):
if self._data.startswith(r, self._pos):
return len(r)
return None
raise ValueError(r)
def advance(self, by):
if by is None:
amt = 0
elif isinstance(by, compat_Match):
amt = len(by.group(0))
elif isinstance(by, str):
amt = len(by)
elif isinstance(by, int):
amt = by
else:
raise ValueError(by)
self._pos += amt
return by
def consume(self, r):
return self.advance(self.match(r))
def child(self):
return _MatchChildParser(self)
class _MatchChildParser(_MatchParser):
    """
    A tentative parser state. It reads the same data as its parent but
    keeps its own position, so a failed attempt can simply be dropped
    without having moved the parent.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        self.__parent = parent
        # Start where the parent currently stands.
        self._pos = parent._pos

    def commit(self):
        """
        Copy this child's position to the parent and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent
class ParseError(Exception):
    """
    Raised when parsing fails. The message reports the parser's position
    and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        pos = parser._pos
        nearby = parser._data[pos:pos + 20]
        message = "Parse error at position %u (near %r)" % (pos, nearby)
        super(ParseError, self).__init__(message)
# A WebVTT timestamp: optional hours (two or more digits), two-digit minutes,
# two-digit seconds, a dot, and an optional three-digit millisecond field.
# Capture groups are (hours, minutes, seconds, milliseconds); the first and
# last may be None.
_REGEX_TS = re.compile(r'''(?x)
(?:([0-9]{2,}):)?
([0-9]{2}):
([0-9]{2})\.
([0-9]{3})?
''')
# Matches the empty string only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
def _parse_ts(ts):
"""
Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
"""
h, min, s, ms = ts.groups()
return 90 * (
int(h or 0) * 3600000 + # noqa: W504,E221,E222
int(min) * 60000 + # noqa: W504,E221,E222
int(s) * 1000 + # noqa: W504,E221,E222
int(ms) # noqa: W504,E221,E222
)
def _format_ts(ts):
"""
Convert an MPEG PES timestamp into a WebVTT timestamp.
This will lose sub-millisecond precision.
"""
ts = int((ts + 45) // 90)
ms , ts = divmod(ts, 1000) # noqa: W504,E221,E222,E203
s , ts = divmod(ts, 60) # noqa: W504,E221,E222,E203
min, h = divmod(ts, 60) # noqa: W504,E221,E222
return '%02u:%02u:%02u.%03u' % (h, min, s, ms)
class Block(object):
    """
    Base class for WebVTT blocks.

    Subclasses that rely on the default ``parse`` must define a ``_REGEX``
    class attribute; the text it matches is stored verbatim as ``raw``.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.
        Returns the block and advances the parser, or returns None and
        leaves the parser untouched.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """Write the block's stored text to ``stream`` unchanged."""
        stream.write(self.raw)
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that may only occur in the header
    part of the file, i.e. ahead of the first cue block.
    """
class Magic(HeaderBlock):
    """
    The ``WEBVTT`` signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP header that may follow it.

    Instances carry ``extra`` (whatever followed ``WEBVTT`` on the first
    line, or None) and ``local`` / ``mpegts`` (the two sides of the
    timestamp map; each is None when it was not given).
    """
    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.
    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the comma-separated ``LOCAL:``/``MPEGTS:`` components that
        follow ``X-TIMESTAMP-MAP=``, up to and including the line end.

        Returns ``(local, mpegts)``; a component that does not appear is
        None. Raises ParseError on anything unexpected.
        """
        parser = parser.child()
        # Default both sides so that a map carrying only one component does
        # not fail with an unbound local at the return below.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                # _parse_ts always yields an integer, so no None check here.
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(','):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that ends the header. Unlike the other block types
        this raises ParseError instead of returning None on failure.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the timestamp map (only when either side
        is set to a non-zero value) and the terminating blank line.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
class StyleBlock(HeaderBlock):
    """
    A ``STYLE`` block: the keyword on a line of its own, any number of
    non-empty lines that do not contain ``-->``, and a closing line
    terminator. The matched text is kept verbatim.
    """
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class RegionBlock(HeaderBlock):
    """
    A ``REGION`` definition block: the keyword, any number of non-empty
    settings lines that do not contain ``-->``, and a closing line
    terminator. The matched text is kept verbatim.
    """
    # The WebVTT spec puts a line terminator between the "REGION" keyword
    # and the settings list. Without allowing for it here, only the keyword
    # line would match and the settings lines would be left behind for the
    # cue parser to choke on. The terminator is optional so that input
    # accepted before this was added still matches the same way.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class CommentBlock(Block):
    """
    A ``NOTE`` block: the keyword followed by a space, tab or line
    terminator, any number of non-empty lines that do not contain ``-->``,
    and a closing line terminator. The matched text is kept verbatim.
    """
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry ``id`` and ``settings`` (either may be None), ``start``
    and ``end`` as 90 kHz tick counts, and ``text`` holding the payload
    lines exactly as they were read.
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue at the parser's position. Returns the cue and
        advances the parser, or returns None and leaves the parser where
        it was.
        """
        # Work on a child so that a failed attempt leaves the caller's
        # position alone.
        parser = parser.child()

        id_match = parser.consume(cls._REGEX_ID)
        cue_id = id_match.group(1) if id_match else None

        # Timing line: <start> --> <end> [settings] <newline>
        start_match = parser.consume(_REGEX_TS)
        if not (start_match and parser.consume(cls._REGEX_ARROW)):
            return None
        end_match = parser.consume(_REGEX_TS)
        if not end_match:
            return None
        settings_match = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start_ticks = _parse_ts(start_match)
        end_ticks = _parse_ts(end_match)
        cue_settings = None
        if settings_match is not None:
            cue_settings = settings_match.group(1)

        # Gather payload lines until the first line that is empty or absent.
        payload = ''.join(
            line.group(0)
            for line in iter(lambda: parser.consume(cls._REGEX_PAYLOAD), None)
        )

        parser.commit()
        return cls(
            id=cue_id,
            start=start_ticks,
            end=end_ticks,
            settings=cue_settings,
            text=payload,
        )

    def write_into(self, stream):
        """
        Write the cue to ``stream``: the optional identifier line, the
        timing line with optional settings, the payload and one newline.
        """
        pieces = []
        if self.id is not None:
            pieces += [self.id, '\n']
        pieces += [_format_ts(self.start), ' --> ', _format_ts(self.end)]
        if self.settings is not None:
            pieces += [' ', self.settings]
        pieces += ['\n', self.text, '\n']
        for piece in pieces:
            stream.write(piece)

    @property
    def as_json(self):
        """The cue's fields as a plain dictionary."""
        return dict(
            id=self.id,
            start=self.start,
            end=self.end,
            text=self.text,
            settings=self.settings,
        )
def parse_fragment(frag_content):
    """
    A generator that yields (partially) parsed WebVTT blocks when given
    a bytes object containing the raw contents of a WebVTT file.

    The first item is always the Magic header. Blank lines between blocks
    are skipped. ParseError is raised when the header is malformed or when
    the body contains something that is neither a comment nor a cue.
    """
    parser = _MatchParser(frag_content.decode('utf-8'))

    def first_of(block_types):
        # Try each block type in turn; return the first that parses.
        for block_type in block_types:
            parsed = block_type.parse(parser)
            if parsed:
                return parsed
        return None

    yield Magic.parse(parser)

    # Header section: region, style and comment blocks (XXX: comments could
    # be skipped instead). The first thing that is none of these ends it.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_of((RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block

    # Body section: comments (XXX: could be skipped) and cues only.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_of((CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block