[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz
This commit is contained in:
Felix S 2021-07-16 16:22:56 +02:00 committed by GitHub
parent 3b297919e0
commit da1c94ee45
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 11 deletions

View file

@ -2206,7 +2206,7 @@ def _xpath_ns(path, namespace=None):
out.append('{%s}%s' % (namespace, c)) out.append('{%s}%s' % (namespace, c))
return '/'.join(out) return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if smil is False: if smil is False:
@ -2215,8 +2215,21 @@ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None,
namespace = self._parse_smil_namespace(smil) namespace = self._parse_smil_namespace(smil)
return self._parse_smil_formats( fmts = self._parse_smil_formats(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
subs = self._parse_smil_subtitles(
smil, namespace=namespace)
return fmts, subs
def _extract_smil_formats(self, *args, **kwargs):
    """Return only the formats of a SMIL manifest.

    Every positional and keyword argument is forwarded unchanged to
    ``_extract_smil_formats_and_subtitles``. That method yields a
    ``(formats, subtitles)`` pair; this wrapper returns just the formats.
    If the subtitles part is non-empty, a warning is reported because
    those tracks are being dropped here.
    """
    formats, subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if not subtitles:
        return formats

    # Subtitles were parsed but this entry point has no way to hand them
    # back, so tell the user they are being ignored.
    warning = bug_reports_message(
        "Ignoring subtitle tracks found in the SMIL manifest; "
        "if any subtitle tracks are missing,"
    )
    self.report_warning(warning)
    return formats
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal) smil = self._download_smil(smil_url, video_id, fatal=fatal)

View file

@ -2462,7 +2462,7 @@ def _real_extract(self, url):
# Is it an M3U playlist? # Is it an M3U playlist?
if first_bytes.startswith(b'#EXTM3U'): if first_bytes.startswith(b'#EXTM3U'):
info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
self._sort_formats(info_dict['formats']) self._sort_formats(info_dict['formats'])
return info_dict return info_dict
@ -3410,6 +3410,7 @@ def _real_extract(self, url):
if not isinstance(sources, list): if not isinstance(sources, list):
sources = [sources] sources = [sources]
formats = [] formats = []
subtitles = {}
for source in sources: for source in sources:
src = source.get('src') src = source.get('src')
if not src or not isinstance(src, compat_str): if not src or not isinstance(src, compat_str):
@ -3422,12 +3423,16 @@ def _real_extract(self, url):
if src_type == 'video/youtube': if src_type == 'video/youtube':
return self.url_result(src, YoutubeIE.ie_key()) return self.url_result(src, YoutubeIE.ie_key())
if src_type == 'application/dash+xml' or ext == 'mpd': if src_type == 'application/dash+xml' or ext == 'mpd':
formats.extend(self._extract_mpd_formats( fmts, subs = self._extract_mpd_formats_and_subtitles(
src, video_id, mpd_id='dash', fatal=False)) src, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif src_type == 'application/x-mpegurl' or ext == 'm3u8': elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( fmts, subs = self._extract_m3u8_formats_and_subtitles(
src, video_id, 'mp4', entry_protocol='m3u8_native', src, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False)) m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else: else:
formats.append({ formats.append({
'url': src, 'url': src,
@ -3437,9 +3442,10 @@ def _real_extract(self, url):
'Referer': full_response.geturl(), 'Referer': full_response.geturl(),
}, },
}) })
if formats: if formats or subtitles:
self._sort_formats(formats) self._sort_formats(formats)
info_dict['formats'] = formats info_dict['formats'] = formats
info_dict['subtitles'] = subtitles
return info_dict return info_dict
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
@ -3574,13 +3580,13 @@ def filter_video(urls):
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'smil': if ext == 'smil':
entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
elif ext == 'xspf': elif ext == 'xspf':
return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
elif ext == 'm3u8': elif ext == 'm3u8':
entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4')
elif ext == 'mpd': elif ext == 'mpd':
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id)
elif ext == 'f4m': elif ext == 'f4m':
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: