[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams.

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz
This commit is contained in:
felix 2021-05-23 18:34:49 +02:00 committed by pukkandan
parent 4d85fbbdbb
commit cdb19aa4c2
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698
Notes: pukkandan 2021-06-13 22:41:29 +05:30
This also adds extraction of storyboards from DASH manifests as mhtml
6 changed files with 248 additions and 16 deletions

View file

@ -22,6 +22,7 @@ def _get_real_downloader(info_dict, protocol=None, *args, **kwargs):
from .rtmp import RtmpFD from .rtmp import RtmpFD
from .rtsp import RtspFD from .rtsp import RtspFD
from .ism import IsmFD from .ism import IsmFD
from .mhtml import MhtmlFD
from .niconico import NiconicoDmcFD from .niconico import NiconicoDmcFD
from .youtube_live_chat import YoutubeLiveChatReplayFD from .youtube_live_chat import YoutubeLiveChatReplayFD
from .external import ( from .external import (
@ -39,6 +40,7 @@ def _get_real_downloader(info_dict, protocol=None, *args, **kwargs):
'f4m': F4mFD, 'f4m': F4mFD,
'http_dash_segments': DashSegmentsFD, 'http_dash_segments': DashSegmentsFD,
'ism': IsmFD, 'ism': IsmFD,
'mhtml': MhtmlFD,
'niconico_dmc': NiconicoDmcFD, 'niconico_dmc': NiconicoDmcFD,
'youtube_live_chat_replay': YoutubeLiveChatReplayFD, 'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
} }

202
yt_dlp/downloader/mhtml.py Normal file
View file

@ -0,0 +1,202 @@
# coding: utf-8
from __future__ import unicode_literals
import io
import quopri
import re
import uuid
from .fragment import FragmentFD
from ..utils import (
escapeHTML,
formatSeconds,
srt_subtitles_timecode,
urljoin,
)
from ..version import __version__ as YT_DLP_VERSION
class MhtmlFD(FragmentFD):
    """Fragment downloader that packs a sequence of images into one MHTML file.

    The output is a ``multipart/related`` MIME document: the first part is a
    generated HTML page with one ``<figure>`` per fragment, and every
    following part is one downloaded image, referenced from the page through
    a ``cid:`` URL.
    """

    FD_NAME = 'mhtml'

    # Stylesheet embedded in the generated page. The literal is written for
    # readability only: the two substitutions below collapse every run of
    # whitespace and then drop the spaces that sit next to punctuation.
    _STYLESHEET = """\
        html, body {
            margin: 0;
            padding: 0;
            height: 100vh;
        }

        html {
            overflow-y: scroll;
            scroll-snap-type: y mandatory;
        }

        body {
            scroll-snap-type: y mandatory;
            display: flex;
            flex-flow: column;
        }

        body > figure {
            max-width: 100vw;
            max-height: 100vh;
            scroll-snap-align: center;
        }

        body > figure > figcaption {
            text-align: center;
            height: 2.5em;
        }

        body > figure > img {
            display: block;
            margin: auto;
            max-width: 100%;
            max-height: calc(100vh - 5em);
        }
    """
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        """Return *s* wrapped as a ``=?utf-8?Q?...?=`` encoded word.

        The text is UTF-8 encoded and passed through ``quopri`` in header
        mode; any byte below 0x20 left in that output is additionally
        written as ``=XX`` so no control character reaches the header line.
        """
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode('utf-8'), header=True)
        )).decode('us-ascii') + '?='

    def _gen_cid(self, i, fragment, frag_boundary):
        """Return the Content-ID for fragment number *i*.

        The ID is built from the fragment index and the MIME boundary only;
        *fragment* is accepted but not used.
        """
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Build the HTML page that references every fragment by ``cid:`` URL.

        Each fragment becomes a ``<figure>`` with a caption and an ``<img>``.
        The caption carries start/end timecodes accumulated from the
        fragments' ``duration`` values; when a duration is missing or
        unusable the caption falls back to the slide number alone.

        Returns the page as a ``str``.
        """
        output = io.StringIO()

        output.write((
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            ''  '<meta name="generator" content="yt-dlp {version}">'
            ''  '<title>{title}</title>'
            ''  '<style>{styles}</style>'
            '<body>'
        ).format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title)
        ))

        t0 = 0
        for i, frag in enumerate(fragments):
            output.write('<figure>')
            try:
                t1 = t0 + frag['duration']
                output.write((
                    '<figcaption>Slide #{num}: {t0} {t1} (duration: {duration})</figcaption>'
                ).format(
                    num=i + 1,
                    t0=srt_subtitles_timecode(t0),
                    t1=srt_subtitles_timecode(t1),
                    duration=formatSeconds(frag['duration'], msec=True)
                ))
            except (KeyError, ValueError, TypeError):
                # Once the running time is unknown (t1 = None) the addition
                # above raises TypeError for every later slide as well, so
                # all of them get the plain caption.
                t1 = None
                output.write((
                    '<figcaption>Slide #{num}</figcaption>'
                ).format(num=i + 1))
            output.write('<img src="cid:{cid}">'.format(
                cid=self._gen_cid(i, frag, frag_boundary)))
            output.write('</figure>')
            t0 = t1

        return output.getvalue()

    def real_download(self, filename, info_dict):
        """Download all fragments and write them to *filename* as MHTML.

        Reads ``fragments``, ``title``, ``webpage_url`` and (optionally)
        ``fragment_base_url`` from *info_dict*. When the ``test`` param is
        set only the first fragment is fetched. Fragments that fail to
        download are skipped. Returns ``True``.
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']
        title = info_dict['title']
        origin = info_dict['webpage_url']

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        self._prepare_and_start_frag_download(ctx)

        # setdefault keeps an 'extra_state' that is already present in ctx,
        # so an existing boundary and header flag are reused rather than
        # regenerated (presumably restored when resuming - TODO confirm).
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )

            ctx['dest_stream'].write((
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                ''  'boundary="{boundary}"; '
                ''  'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
                '{stub}\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                # The part is written as UTF-8, so the length must count
                # encoded bytes, not characters (they differ as soon as the
                # title contains non-ASCII text).
                length=len(stub.encode('utf-8')),
                title=self._escape_mime(title),
                stub=stub
            ).encode('utf-8'))
            extra_state['header_written'] = True

        for i, fragment in enumerate(fragments):
            # Skip fragments at or below the index already recorded in ctx.
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = urljoin(fragment_base_url, fragment['path'])
            success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue

            # Sniff the image type from its magic bytes; JPEG is the default.
            mime_type = b'image/jpeg'
            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
                mime_type = b'image/png'
            elif frag_content.startswith((b'GIF87a', b'GIF89a')):
                mime_type = b'image/gif'
            elif frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
                # frag_content is bytes, so the tag must be a bytes literal;
                # comparing against a str never matches on Python 3.
                mime_type = b'image/webp'

            header_lines = [
                b'--%b\r\n' % frag_boundary.encode('us-ascii'),
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'),
                b'Content-type: %b\r\n' % mime_type,
                b'Content-length: %u\r\n' % len(frag_content),
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'),
            ]
            # _gen_stub tolerates fragments without a duration, so do the
            # same here instead of failing with KeyError mid-download.
            duration = fragment.get('duration')
            if duration is not None:
                header_lines.append(b'X.yt-dlp.Duration: %f\r\n' % duration)
            header_lines.append(b'\r\n')

            self._append_fragment(
                ctx, b''.join(header_lines) + frag_content + b'\r\n')

        # Closing delimiter of the multipart document.
        ctx['dest_stream'].write(
            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
        self._finish_frag_download(ctx)
        return True

View file

@ -24,7 +24,7 @@ class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107', 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
'info_dict': { 'info_dict': {
'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
@ -32,9 +32,9 @@ class CanvasIE(InfoExtractor):
'title': 'Nachtwacht: De Greystook', 'title': 'Nachtwacht: De Greystook',
'description': 'Nachtwacht: De Greystook', 'description': 'Nachtwacht: De Greystook',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1468.04, 'duration': 1468.02,
}, },
'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], 'expected_warnings': ['is not a supported codec'],
}, { }, {
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True, 'only_matching': True,

View file

@ -2126,6 +2126,7 @@ def extract_media(x_media_line):
format_id.append(str(format_index)) format_id.append(str(format_index))
f = { f = {
'format_id': '-'.join(format_id), 'format_id': '-'.join(format_id),
'format_note': name,
'format_index': format_index, 'format_index': format_index,
'url': manifest_url, 'url': manifest_url,
'manifest_url': m3u8_url, 'manifest_url': m3u8_url,
@ -2637,7 +2638,7 @@ def extract_Initialization(source):
mime_type = representation_attrib['mimeType'] mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
if content_type in ('video', 'audio', 'text'): if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
base_url = '' base_url = ''
for element in (representation, adaptation_set, period, mpd_doc): for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL')) base_url_e = element.find(_add_ns('BaseURL'))
@ -2654,9 +2655,15 @@ def extract_Initialization(source):
url_el = representation.find(_add_ns('BaseURL')) url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth')) bandwidth = int_or_none(representation_attrib.get('bandwidth'))
if representation_id is not None:
format_id = representation_id
else:
format_id = content_type
if mpd_id:
format_id = mpd_id + '-' + format_id
if content_type in ('video', 'audio'): if content_type in ('video', 'audio'):
f = { f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'format_id': format_id,
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type), 'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')), 'width': int_or_none(representation_attrib.get('width')),
@ -2676,6 +2683,17 @@ def extract_Initialization(source):
'manifest_url': mpd_url, 'manifest_url': mpd_url,
'filesize': filesize, 'filesize': filesize,
} }
elif mime_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers): def prepare_template(template_name, identifiers):
@ -2694,7 +2712,8 @@ def prepare_template(template_name, identifiers):
t += c t += c
# Next, $...$ templates are translated to their # Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator # %(...) counterparts to be used with % operator
t = t.replace('$RepresentationID$', representation_id) if representation_id is not None:
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$') t.replace('$$', '$')
@ -2811,7 +2830,7 @@ def add_segment_url():
'url': mpd_url or base_url, 'url': mpd_url or base_url,
'fragment_base_url': base_url, 'fragment_base_url': base_url,
'fragments': [], 'fragments': [],
'protocol': 'http_dash_segments', 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
}) })
if 'initialization_url' in representation_ms_info: if 'initialization_url' in representation_ms_info:
initialization_url = representation_ms_info['initialization_url'] initialization_url = representation_ms_info['initialization_url']
@ -2822,7 +2841,7 @@ def add_segment_url():
else: else:
# Assuming direct URL to unfragmented media. # Assuming direct URL to unfragmented media.
f['url'] = base_url f['url'] = base_url
if content_type in ('video', 'audio'): if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
formats.append(f) formats.append(f)
elif content_type == 'text': elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f) subtitles.setdefault(lang or 'und', []).append(f)

View file

@ -142,6 +142,7 @@ class VikiIE(VikiBaseIE):
IE_NAME = 'viki' IE_NAME = 'viki'
_VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
_TESTS = [{ _TESTS = [{
'note': 'Free non-DRM video with storyboards in MPD',
'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1',
'info_dict': { 'info_dict': {
'id': '1175236v', 'id': '1175236v',
@ -155,7 +156,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': { 'info_dict': {
@ -173,7 +173,6 @@ class VikiIE(VikiBaseIE):
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'skip': 'Blocked in the US', 'skip': 'Blocked in the US',
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# clip # clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@ -225,7 +224,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# youtube external # youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -264,7 +262,6 @@ class VikiIE(VikiBaseIE):
'params': { 'params': {
'format': 'bestvideo', 'format': 'bestvideo',
}, },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View file

@ -2244,6 +2244,17 @@ def unescapeHTML(s):
r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def escapeHTML(text):
    """Return *text* with ``&``, ``<``, ``>``, ``"`` and ``'`` replaced by HTML entities."""
    # '&' is handled first so that the ampersands introduced by the later
    # replacements are not escaped a second time.
    substitutions = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    escaped = text
    for raw, entity in substitutions:
        escaped = escaped.replace(raw, entity)
    return escaped
def process_communicate_or_kill(p, *args, **kwargs): def process_communicate_or_kill(p, *args, **kwargs):
try: try:
return p.communicate(*args, **kwargs) return p.communicate(*args, **kwargs)
@ -2323,13 +2334,14 @@ def decodeOption(optval):
return optval return optval
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as ``H:MM:SS``, ``M:SS`` or ``S``.

    secs  -- duration in seconds (int or float)
    delim -- separator placed between the hour/minute/second fields
    msec  -- when true, append ``.mmm`` with the fractional part of *secs*
             expressed in milliseconds

    Hours are shown only above 3600 seconds and minutes only above 60
    seconds, so exactly 60 is rendered as ``60``.
    """
    if secs > 3600:
        ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
    elif secs > 60:
        ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
    else:
        ret = '%d' % secs
    if not msec:
        return ret
    # secs % 1 is a fraction in [0, 1); it has to be scaled to milliseconds
    # before integer formatting, otherwise '%03d' always prints 000.
    # Rounding absorbs float noise (e.g. 299.999... -> 300); the clamp keeps
    # the field at three digits instead of carrying into the seconds.
    millis = min(999, int(round((secs % 1) * 1000)))
    return '%s.%03d' % (ret, millis)
def make_HTTPS_handler(params, **kwargs): def make_HTTPS_handler(params, **kwargs):