mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-07 08:31:17 +00:00
[generic] Extract previously missed subtitles (#515)
* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz
This commit is contained in:
parent
3b297919e0
commit
da1c94ee45
|
@ -2206,7 +2206,7 @@ def _xpath_ns(path, namespace=None):
|
||||||
out.append('{%s}%s' % (namespace, c))
|
out.append('{%s}%s' % (namespace, c))
|
||||||
return '/'.join(out)
|
return '/'.join(out)
|
||||||
|
|
||||||
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
|
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
|
||||||
smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
|
smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
|
||||||
|
|
||||||
if smil is False:
|
if smil is False:
|
||||||
|
@ -2215,8 +2215,21 @@ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None,
|
||||||
|
|
||||||
namespace = self._parse_smil_namespace(smil)
|
namespace = self._parse_smil_namespace(smil)
|
||||||
|
|
||||||
return self._parse_smil_formats(
|
fmts = self._parse_smil_formats(
|
||||||
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
||||||
|
subs = self._parse_smil_subtitles(
|
||||||
|
smil, namespace=namespace)
|
||||||
|
|
||||||
|
return fmts, subs
|
||||||
|
|
||||||
|
def _extract_smil_formats(self, *args, **kwargs):
|
||||||
|
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
|
||||||
|
if subs:
|
||||||
|
self.report_warning(bug_reports_message(
|
||||||
|
"Ignoring subtitle tracks found in the SMIL manifest; "
|
||||||
|
"if any subtitle tracks are missing,"
|
||||||
|
))
|
||||||
|
return fmts
|
||||||
|
|
||||||
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
|
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
|
||||||
smil = self._download_smil(smil_url, video_id, fatal=fatal)
|
smil = self._download_smil(smil_url, video_id, fatal=fatal)
|
||||||
|
|
|
@ -2462,7 +2462,7 @@ def _real_extract(self, url):
|
||||||
|
|
||||||
# Is it an M3U playlist?
|
# Is it an M3U playlist?
|
||||||
if first_bytes.startswith(b'#EXTM3U'):
|
if first_bytes.startswith(b'#EXTM3U'):
|
||||||
info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
|
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
|
||||||
self._sort_formats(info_dict['formats'])
|
self._sort_formats(info_dict['formats'])
|
||||||
return info_dict
|
return info_dict
|
||||||
|
|
||||||
|
@ -3410,6 +3410,7 @@ def _real_extract(self, url):
|
||||||
if not isinstance(sources, list):
|
if not isinstance(sources, list):
|
||||||
sources = [sources]
|
sources = [sources]
|
||||||
formats = []
|
formats = []
|
||||||
|
subtitles = {}
|
||||||
for source in sources:
|
for source in sources:
|
||||||
src = source.get('src')
|
src = source.get('src')
|
||||||
if not src or not isinstance(src, compat_str):
|
if not src or not isinstance(src, compat_str):
|
||||||
|
@ -3422,12 +3423,16 @@ def _real_extract(self, url):
|
||||||
if src_type == 'video/youtube':
|
if src_type == 'video/youtube':
|
||||||
return self.url_result(src, YoutubeIE.ie_key())
|
return self.url_result(src, YoutubeIE.ie_key())
|
||||||
if src_type == 'application/dash+xml' or ext == 'mpd':
|
if src_type == 'application/dash+xml' or ext == 'mpd':
|
||||||
formats.extend(self._extract_mpd_formats(
|
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||||
src, video_id, mpd_id='dash', fatal=False))
|
src, video_id, mpd_id='dash', fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
|
elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
|
||||||
formats.extend(self._extract_m3u8_formats(
|
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
src, video_id, 'mp4', entry_protocol='m3u8_native',
|
src, video_id, 'mp4', entry_protocol='m3u8_native',
|
||||||
m3u8_id='hls', fatal=False))
|
m3u8_id='hls', fatal=False)
|
||||||
|
formats.extend(fmts)
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
else:
|
else:
|
||||||
formats.append({
|
formats.append({
|
||||||
'url': src,
|
'url': src,
|
||||||
|
@ -3437,9 +3442,10 @@ def _real_extract(self, url):
|
||||||
'Referer': full_response.geturl(),
|
'Referer': full_response.geturl(),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
if formats:
|
if formats or subtitles:
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
info_dict['formats'] = formats
|
info_dict['formats'] = formats
|
||||||
|
info_dict['subtitles'] = subtitles
|
||||||
return info_dict
|
return info_dict
|
||||||
|
|
||||||
# Looking for http://schema.org/VideoObject
|
# Looking for http://schema.org/VideoObject
|
||||||
|
@ -3574,13 +3580,13 @@ def filter_video(urls):
|
||||||
|
|
||||||
ext = determine_ext(video_url)
|
ext = determine_ext(video_url)
|
||||||
if ext == 'smil':
|
if ext == 'smil':
|
||||||
entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
|
entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
|
||||||
elif ext == 'xspf':
|
elif ext == 'xspf':
|
||||||
return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
|
return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
|
||||||
elif ext == 'm3u8':
|
elif ext == 'm3u8':
|
||||||
entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
|
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4')
|
||||||
elif ext == 'mpd':
|
elif ext == 'mpd':
|
||||||
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
|
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id)
|
||||||
elif ext == 'f4m':
|
elif ext == 'f4m':
|
||||||
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
|
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
|
||||||
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
|
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
|
||||||
|
|
Loading…
Reference in a new issue