[extractor] Prevent unnecessary download of hls manifests

and refactor `hls_split_discontinuity` code
This commit is contained in:
pukkandan 2021-07-07 02:24:58 +05:30
parent 723d44b92b
commit 60755938b3
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698

View file

@ -1979,24 +1979,33 @@ def _parse_m3u8_formats_and_subtitles(
preference=None, quality=None, m3u8_id=None, live=False, note=None, preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={}, errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None): video_id=None):
formats, subtitles = [], {}
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return [], {} return formats, subtitles
if (not self.get_param('allow_unplayable_formats') if (not self.get_param('allow_unplayable_formats')
and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
return [], {} return formats, subtitles
formats = [] def format_url(url):
return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
subtitles = {} if self.get_param('hls_split_discontinuity', False):
def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
if not m3u8_doc:
if not manifest_url:
return []
m3u8_doc = self._download_webpage(
manifest_url, video_id, fatal=fatal, data=data, headers=headers,
note=False, errnote='Failed to download m3u8 playlist information')
if m3u8_doc is False:
return []
return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
format_url = lambda u: ( else:
u def _extract_m3u8_playlist_indices(*args, **kwargs):
if re.match(r'^https?://', u) return [None]
else compat_urlparse.urljoin(m3u8_url, u))
split_discontinuity = self.get_param('hls_split_discontinuity', False)
# References: # References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
@ -2014,68 +2023,16 @@ def _parse_m3u8_formats_and_subtitles(
# media playlist and MUST NOT appear in master playlist thus we can # media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion. # clearly detect media playlist with this criterion.
def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
                                   fatal=True, data=None, headers={}):
    """Group the entries of a media playlist into per-section dicts.

    If ``m3u8_doc`` is empty, the playlist is downloaded from
    ``format_url`` and ``format_url`` is replaced by the URL reported by
    the response handle. An empty list is returned when there is neither
    a document nor a URL, or when the download yields ``False``.

    Every line that does not start with ``#`` is appended to the
    ``'files'`` list of the current section. When the enclosing scope's
    ``split_discontinuity`` is truthy, each line starting with
    ``#EXT-X-DISCONTINUITY`` opens a new section with the next index;
    otherwise a single section with index ``None`` is returned.
    """
    if not m3u8_doc:
        if not format_url:
            return []
        response = self._download_webpage_handle(
            format_url, video_id, note=False,
            errnote='Failed to download m3u8 playlist information',
            fatal=fatal, data=data, headers=headers)
        if response is False:
            return []
        m3u8_doc, url_handle = response
        format_url = url_handle.geturl()

    section_index = 0 if split_discontinuity else None
    # The first section carries 'key_data'; later ones carry 'url' instead
    current_section = {
        'index': section_index,
        'key_data': None,
        'files': [],
    }
    sections = [current_section]
    for playlist_line in m3u8_doc.splitlines():
        if not playlist_line.startswith('#'):
            current_section['files'].append(playlist_line)
            continue
        if split_discontinuity and playlist_line.startswith('#EXT-X-DISCONTINUITY'):
            section_index += 1
            current_section = {
                'index': section_index,
                'url': format_url,
                'files': [],
            }
            sections.append(current_section)
    return sections
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
formats = [{
playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc) 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
'format_index': idx,
for format in playlist_formats:
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
format_index = format.get('index')
if format_index:
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
'format_index': format_index,
'url': m3u8_url, 'url': m3u8_url,
'ext': ext, 'ext': ext,
'protocol': entry_protocol, 'protocol': entry_protocol,
'preference': preference, 'preference': preference,
'quality': quality, 'quality': quality,
} } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
formats.append(f)
return formats, subtitles return formats, subtitles
@ -2115,21 +2072,10 @@ def extract_media(x_media_line):
media_url = media.get('URI') media_url = media.get('URI')
if media_url: if media_url:
manifest_url = format_url(media_url) manifest_url = format_url(media_url)
format_id = [] formats.extend({
playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id, 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
fatal=fatal, data=data, headers=headers)
for format in playlist_formats:
format_index = format.get('index')
for v in (m3u8_id, group_id, name):
if v:
format_id.append(v)
if format_index:
format_id.append(str(format_index))
f = {
'format_id': '-'.join(format_id),
'format_note': name, 'format_note': name,
'format_index': format_index, 'format_index': idx,
'url': manifest_url, 'url': manifest_url,
'manifest_url': m3u8_url, 'manifest_url': m3u8_url,
'language': media.get('LANGUAGE'), 'language': media.get('LANGUAGE'),
@ -2137,10 +2083,8 @@ def extract_media(x_media_line):
'protocol': entry_protocol, 'protocol': entry_protocol,
'preference': preference, 'preference': preference,
'quality': quality, 'quality': quality,
} 'vcodec': 'none' if media_type == 'AUDIO' else None,
if media_type == 'AUDIO': } for idx in _extract_m3u8_playlist_indices(manifest_url))
f['vcodec'] = 'none'
formats.append(f)
def build_stream_name(): def build_stream_name():
# Despite specification does not mention NAME attribute for # Despite specification does not mention NAME attribute for
@ -2179,25 +2123,17 @@ def build_stream_name():
or last_stream_inf.get('BANDWIDTH'), scale=1000) or last_stream_inf.get('BANDWIDTH'), scale=1000)
manifest_url = format_url(line.strip()) manifest_url = format_url(line.strip())
playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id, for idx in _extract_m3u8_playlist_indices(manifest_url):
fatal=fatal, data=data, headers=headers) format_id = [m3u8_id, None, idx]
for frmt in playlist_formats:
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
format_index = frmt.get('index')
stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making # Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided # format_id unpredictable. So it's better to keep provided
# format_id intact. # format_id intact.
if not live: if not live:
format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) stream_name = build_stream_name()
if format_index: format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
format_id.append(str(format_index))
f = { f = {
'format_id': '-'.join(format_id), 'format_id': '-'.join(map(str, filter(None, format_id))),
'format_index': format_index, 'format_index': idx,
'url': manifest_url, 'url': manifest_url,
'manifest_url': m3u8_url, 'manifest_url': m3u8_url,
'tbr': tbr, 'tbr': tbr,