From f14a2d838240e9e75fe52d4e381156064e90674c Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 8 Jul 2022 03:25:04 +0530 Subject: [PATCH] [extractor/html5] Separate into own extractor (#4307) Closes #4291 Authored by: coletdjnz, pukkandan --- test/test_http.py | 4 ++-- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/generic.py | 19 ------------------- yt_dlp/extractor/genericembeds.py | 27 +++++++++++++++++++++++++++ 4 files changed, 30 insertions(+), 21 deletions(-) create mode 100644 yt_dlp/extractor/genericembeds.py diff --git a/test/test_http.py b/test/test_http.py index b1aac77206..5ca0d7a470 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -85,7 +85,7 @@ def test_nocheckcertificate(self): ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) - self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) + self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) class TestClientCert(unittest.TestCase): @@ -113,7 +113,7 @@ def _run_test(self, **params): **params, }) r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) - self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) + self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) def test_certificate_combined_nopass(self): self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt')) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b62b8113c1..221c1598df 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -662,6 +662,7 @@ HSEShowIE, HSEProductIE, ) +from .genericembeds import HTML5MediaEmbedIE from .huajiao import HuajiaoIE from .huya import HuyaLiveIE from .huffpost import HuffPostIE diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 3d574cd022..ec1cbf005f 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -3776,25 +3776,6 @@ def _real_extract(self, url): elif embeds: return self.playlist_result(embeds, **info_dict) - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - self.report_detected('HTML5 media') - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': f'{video_id}-{num}', - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) - jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: diff --git a/yt_dlp/extractor/genericembeds.py b/yt_dlp/extractor/genericembeds.py new file mode 100644 index 0000000000..ec2673059d --- /dev/null +++ b/yt_dlp/extractor/genericembeds.py @@ -0,0 +1,27 @@ +from .common import InfoExtractor + + +class HTML5MediaEmbedIE(InfoExtractor): + _VALID_URL = False + IE_NAME = 'html5' + _WEBPAGE_TESTS = [ + { + 'url': 'https://html.com/media/', + 'info_dict': { + 'title': 'HTML5 Media', + 'description': 'md5:933b2d02ceffe7a7a0f3c8326d91cc2a', + }, + 'playlist_count': 2 + } + ] + + def _extract_from_webpage(self, url, webpage): + video_id, title = self._generic_id(url), self._generic_title(url) + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': f'{video_id}-{num}', + 'title': f'{title} ({num})', + }) + self._sort_formats(entry['formats']) + yield entry