From 4de94b9e165bfd6421a692f5f2eabcdb08edcb71 Mon Sep 17 00:00:00 2001 From: garret Date: Mon, 9 Oct 2023 19:00:26 +0100 Subject: [PATCH] [ie/nhk] Fix Japanese-language VOD extraction (#8309) Closes #8303 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 68 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index bcbc2279f..f6b5c501b 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -68,11 +68,12 @@ def _extract_formats_and_subtitles(self, vod_id): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None - lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if len(episode_id) == 7: + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') + is_video = m_type == 'video' + + if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] @@ -133,47 +134,46 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = r'%s%s(?P[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?Pvideo)/(?P[0-9a-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}/(?Paudio)/(?P[^/?#]+?-\d{{8}}-[0-9a-z]+)'] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/', 'info_dict': { - 'id': 'yd8322ch', + 'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302', 'ext': 'mp4', - 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', - 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', - 'upload_date': '20230514', - 'timestamp': 1684083791, - 'series': 'GRAND SUMO Highlights', - 'episode': '[Recap] May Tournament Day 1 (Opening Day)', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', + 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', + 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', + 'series': 'Japan Railway Journal', }, }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'md5': '153c3016dfd252ba09726588149cf0e7', 'info_dict': { - 'id': 'a95j5iza', + 'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5', 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + # radio + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/', 'info_dict': { - 'id': 'r_inventions-20201104-1-en', + 'id': 'livinginjapan-20231001-1-en', 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', + 'series': 'Living in Japan', + 'description': 'md5:850611969932874b4a3309e0cae06c2f', + 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, - 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -199,6 +199,19 @@ class NhkVodIE(NhkBaseIE): 'timestamp': 1623722008, }, 'skip': '404 Not Found', + }, { + # japanese-language, longer id than english + 'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/', + 'info_dict': { + 'id': 'nw_ja_v_jvod_ohayou_20231008', + 'ext': 'mp4', + 'title': 'おはよう日本(7時台) - 10月8日放送', + 'series': 'おはよう日本(7時台)', + 'episode': '10月8日放送', + 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', + }, + 'skip': 'expires 2023-10-15', }] def _real_extract(self, url): @@ -206,7 +219,7 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P[0-9a-z]+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P\w+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' _TESTS = [{ # video program episodes 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', @@ -240,8 +253,7 @@ class NhkVodProgramIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() - + lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( program_id, lang, m_type == 'video', False, episode_type == 'clip')