[twitter] Extract subtitles from HLS manifests

This commit is contained in:
Felix S 2021-04-15 14:12:59 +02:00
parent efe9dba595
commit 4bed436371

View file

@ -36,9 +36,9 @@ class TwitterBaseIE(InfoExtractor):
def _extract_variant_formats(self, variant, video_id): def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url') variant_url = variant.get('url')
if not variant_url: if not variant_url:
return [] return [], {}
elif '.m3u8' in variant_url: elif '.m3u8' in variant_url:
return self._extract_m3u8_formats( return self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native', variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False) m3u8_id='hls', fatal=False)
else: else:
@ -49,22 +49,27 @@ def _extract_variant_formats(self, variant, video_id):
'tbr': tbr, 'tbr': tbr,
} }
self._search_dimensions_in_video_url(f, variant_url) self._search_dimensions_in_video_url(f, variant_url)
return [f] return [f], {}
def _extract_formats_from_vmap_url(self, vmap_url, video_id): def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id) vmap_data = self._download_xml(vmap_url, video_id)
formats = [] formats = []
subtitles = {}
urls = [] urls = []
for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
video_variant.attrib['url'] = compat_urllib_parse_unquote( video_variant.attrib['url'] = compat_urllib_parse_unquote(
video_variant.attrib['url']) video_variant.attrib['url'])
urls.append(video_variant.attrib['url']) urls.append(video_variant.attrib['url'])
formats.extend(self._extract_variant_formats( fmts, subs = self._extract_variant_formats(
video_variant.attrib, video_id)) video_variant.attrib, video_id)
formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
if video_url not in urls: if video_url not in urls:
formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) fmts, subs = self._extract_variant_formats({'url': video_url}, video_id)
return formats formats.extend(fmts)
subtitles = self._merge_subtitles(subtitles, subs)
return formats, subtitles
@staticmethod @staticmethod
def _search_dimensions_in_video_url(a_format, video_url): def _search_dimensions_in_video_url(a_format, video_url):
@ -471,8 +476,11 @@ def extract_from_video_info(media):
video_info = media.get('video_info') or {} video_info = media.get('video_info') or {}
formats = [] formats = []
subtitles = {}
for variant in video_info.get('variants', []): for variant in video_info.get('variants', []):
formats.extend(self._extract_variant_formats(variant, twid)) fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
self._sort_formats(formats) self._sort_formats(formats)
thumbnails = [] thumbnails = []
@ -491,6 +499,7 @@ def add_thumbnail(name, size):
info.update({ info.update({
'formats': formats, 'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000), 'duration': float_or_none(video_info.get('duration_millis'), 1000),
}) })
@ -540,7 +549,7 @@ def get_binding_value(k):
is_amplify = card_name == 'amplify' is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
self._sort_formats(formats) self._sort_formats(formats)
thumbnails = [] thumbnails = []
@ -558,6 +567,7 @@ def get_binding_value(k):
info.update({ info.update({
'formats': formats, 'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'duration': int_or_none(get_binding_value( 'duration': int_or_none(get_binding_value(
'content_duration_seconds')), 'content_duration_seconds')),