diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8d59360949..660524cb08 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -942,6 +942,10 @@ from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE from .kelbyone import KelbyOneIE +from .kenh14 import ( + Kenh14PlaylistIE, + Kenh14VideoIE, +) from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, diff --git a/yt_dlp/extractor/kenh14.py b/yt_dlp/extractor/kenh14.py index 77c7537eb7..22c2e7edc5 100644 --- a/yt_dlp/extractor/kenh14.py +++ b/yt_dlp/extractor/kenh14.py @@ -1,321 +1,150 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( clean_html, extract_attributes, get_element_by_class, get_element_html_by_attribute, - get_elements_by_attribute, - get_elements_html_by_attribute, + get_elements_html_by_class, int_or_none, parse_duration, parse_iso8601, - remove_end, remove_start, + str_or_none, + strip_or_none, + unescapeHTML, + url_or_none, ) +from ..utils.traversal import traverse_obj -class Kenh14IE(InfoExtractor): - IE_NAME = 'kenh14' - _VALID_URL = r'https?://video.kenh14\.vn/(?:playlist/)?[^/]*-(?P[0-9]+)\.chn' +class Kenh14VideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P[0-9]+)\.chn' _TESTS = [{ - 'url': 'https://video.kenh14.vn/video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn', - 'note': 'Video URL', - 'md5': '525b9c4646a7aed819697cfd17dd25a9', + 'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn', + 'md5': '1ed67f9c3a1e74acf15db69590cf6210', 'info_dict': { 'id': '316173', 'ext': 'mp4', 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', - 'description': 'Video mở hộp iPhone 14 Pro Max', - 'thumbnail': r're:^https?://.*\.jpg$', - 'thumbnails': list, - 'formats': list, - 'tags': ["iPhone 14 Pro", "iPhone 14 Pro Max", "iPhone 14"], - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', + 'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], 'uploader': 'Unbox Therapy', - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', 'upload_date': '20220517', 'view_count': int, - 'release_date': '20220518', - 'modified_date': '20220518', 'duration': 722.86, - 'modified_timestamp': 1652848039, - 'release_timestamp': 1652848039, - 'timestamp': int, - } - }, { - 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', - 'note': 'Playlist URL', - 'info_dict': { - 'id': '316972', - 'ext': 'mp4', - 'description': 'md5:4212062bf4c447efbad5f54e6ab8d132', - 'thumbnail': 'https://kenh14cdn.com/203336854389633024/2022/6/5/1024-1281-1654416995812779954762.jpg', - - 'modified_timestamp': 1654376400, - 'duration': 4602.09, - 'timestamp': 1654398990, - 'release_date': '20220604', - 'tags': ['Sơn Soho', 'Linh Keen', 'Trần Tình (Naked Love) mùa 2', 'Naked Love', 'trần tình', 'ShowHot'], - 'release_timestamp': 1654376400, - 'view_count': int, - 'modified_date': '20220604', - 'title': '[4x5] FINAL - Naked Love EP4 - Moi quan he tieu cuc', - 'upload_date': '20220605', - 'display_id': 'md5:a27d1fbdeafeb740050de0e697c8e02e', - } - }, { - 'url': 'https://video.kenh14.vn/video-316173.chn', - 'note': 'javascript-based redirect; set via ', - 'md5': '525b9c4646a7aed819697cfd17dd25a9', - 'info_dict': { - 'id': '316173', - 'ext': 'mp4', - 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', - 'description': 'Video mở hộp iPhone 14 Pro Max', - 'thumbnail': r're:^https?://.*\.jpg$', - 'thumbnails': list, - 'formats': list, - 'tags': ["iPhone 14 Pro", "iPhone 14 Pro Max", "iPhone 14"], - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', - 'uploader': 'Unbox Therapy', - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', - 'upload_date': '20220517', - 'view_count': int, - 'release_date': '20220518', - 'modified_date': '20220518', - 'duration': 722.86, - 'modified_timestamp': 1652848039, - 'release_timestamp': 1652848039, - 'timestamp': int, - } - }, { - 'url': 'https://video.kenh14.vn/0-316173.chn', - 'note': 'HTTP 301 redirect to canonical URL', - 'md5': '525b9c4646a7aed819697cfd17dd25a9', - 'info_dict': { - 'id': '316173', - 'ext': 'mp4', - 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', - 'description': 'Video mở hộp iPhone 14 Pro Max', - 'thumbnail': r're:^https?://.*\.jpg$', - 'thumbnails': list, - 'formats': list, - 'tags': ["iPhone 14 Pro", "iPhone 14 Pro Max", "iPhone 14"], - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', - 'uploader': 'Unbox Therapy', - 'display_id': 'video-mo-hop-iphone-14-pro-max-nguon-unbox-therapy', - 'upload_date': '20220517', - 'view_count': int, - 'release_date': '20220518', - 'modified_date': '20220518', - 'duration': 722.86, - 'modified_timestamp': 1652848039, - 'release_timestamp': 1652848039, - 'timestamp': int, - } - }, { - 'url': 'https://video.kenh14.vn/playlist/0-71.chn', - 'note': 'HTTP 301 redirect to canonical playlist URL', - 'info_dict': { - 'id': '316972', - 'ext': 'mp4', - 'description': 'md5:4212062bf4c447efbad5f54e6ab8d132', - 'thumbnail': 'https://kenh14cdn.com/203336854389633024/2022/6/5/1024-1281-1654416995812779954762.jpg', - - 'modified_timestamp': 1654376400, - 'duration': 4602.09, - 'timestamp': 1654398990, - 'release_date': '20220604', - 'tags': ['Sơn Soho', 'Linh Keen', 'Trần Tình (Naked Love) mùa 2', 'Naked Love', 'trần tình', 'ShowHot'], - 'release_timestamp': 1654376400, - 'view_count': int, - 'modified_date': '20220604', - 'title': '[4x5] FINAL - Naked Love EP4 - Moi quan he tieu cuc', - 'upload_date': '20220605', - 'display_id': 'md5:a27d1fbdeafeb740050de0e697c8e02e', - } - }, { - 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', - 'note': 'Playlist URL with --flat-playlist', - 'info_dict': { - 'id': '316972', - 'ext': 'mp4', - 'description': 'md5:4212062bf4c447efbad5f54e6ab8d132', - 'thumbnail': 'https://videothumbs.mediacdn.vn/kenh14/203336854389633024/2022/6/5/4x5-final-naked-love-ep4-moi-quan-he-tieu-cuc-16544166609501118413296.jpg', - 'tags': ['Sơn Soho', 'Linh Keen', 'Trần Tình (Naked Love) mùa 2', 'Naked Love', 'Trần tình', 'ShowHot'], - 'title': 'Naked Love - Trần Tình tập 4: Sơn Soho, Linh Keen cùng câu chuyện bước ra khỏi mối quan hệ tiêu cực và yêu bản thân mình', - 'display_id': 'md5:db3e1b63976a664b3576f3ef68a17c9a', + 'timestamp': 1652764468, }, - 'params': {'extract_flat': 'in_playlist'} + }, { + 'url': 'https://video.kenh14.vn/video-316174.chn', + 'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd', + 'info_dict': { + 'id': '316174', + 'ext': 'mp4', + 'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu', + 'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'upload_date': '20220517', + 'view_count': int, + 'duration': 70.04, + 'timestamp': 1652766021, + }, + }, { + 'url': 'https://video.kenh14.vn/0-344740.chn', + 'md5': 'b843495d5e728142c8870c09b46df2a9', + 'info_dict': { + 'id': '344740', + 'ext': 'mov', + 'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi', + 'description': 'md5:2a2dbb4a7397169fb21ee68f09160497', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$', + 'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'], + 'uploader': 'Quang Vũ', + 'upload_date': '20241024', + 'view_count': int, + 'duration': 198.88, + 'timestamp': 1729741590, + }, + 'expected_warnings': [ + 'Failed to download m3u8 information: HTTP Error 404: NOT FOUND', + ], }] - def _try_get_redirect_url(self, webpage): - # javascript-based redirect; set via - url = self._search_regex(r"onload=\"window.location.href='([^']*)'", webpage, 'redirect url', default=None, fatal=False) - if url: - return 'https://video.kenh14.vn' + url + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - def _extract_formats_wrapper(self, video_id, direct_url): - formats = [{'url': f'https://{direct_url}'}] - formats.extend(self._extract_m3u8_formats(f'https://{direct_url}/master.m3u8', video_id)) - self._sort_formats(formats) - return formats + attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '') + direct_url = attrs['data-vid'] - def _extract_video(self, webpage, page_url, fallback_url=''): - video_id = self._match_id(page_url) - if webpage is None: - webpage, page_url = self._download_webpage_wrapper(page_url, fatal=False) - attrs = extract_attributes(self._extract_video_div(webpage)) - direct_url = attrs.get('data-vid', fallback_url) - filename = remove_start(direct_url, 'kenh14cdn.com/') - inline_metadata = self._parse_json(attrs.get('data-htmlcode', '{}'), video_id, fatal=False) - display_id = self._get_display_id(page_url) + metadata = self._download_json( + 'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format( + remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False) - formats = self._extract_formats_wrapper(video_id, direct_url) - result = { + return { 'id': video_id, 'title': ( - clean_html(self._og_search_title(webpage)) - or inline_metadata.get('video_title') - or self._extract_title(webpage)), - 'formats': formats, - 'duration': parse_duration(inline_metadata.get('video_duration')), + strip_or_none(metadata.get('title')) + or clean_html(self._og_search_title(webpage)) + or clean_html(get_element_by_class('vdbw-title', webpage))), + 'formats': [ + {'url': f'https://{direct_url}', 'format_id': 'http'}, + *self._extract_m3u8_formats(f'https://{direct_url}/master.m3u8', video_id, fatal=False), + ], + 'duration': parse_duration(metadata.get('duration')), 'description': ( clean_html(self._og_search_description(webpage)) - or clean_html(inline_metadata.get('video_description')) - or self._extract_description(webpage)), - 'thumbnail': ( - self._og_search_thumbnail(webpage) - or inline_metadata.get('video_thumb') - or attrs.get('data-thumb')), - 'release_timestamp': parse_iso8601(self._html_search_meta('article:published_time', webpage)), - 'modified_timestamp': parse_iso8601(self._html_search_meta('article:modified_time', webpage)), - 'tags': self._extract_page_tags(webpage), - 'webpage_url': page_url, - 'display_id': display_id, - } - result['timestamp'] = result.get('release_timestamp') or result.get('modified_timestamp') - - metadata = self._download_json(f'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={filename}', video_id) or {} - - return { - # Note: API result contains 'videoID' but it's a different number that doesn't seem public-facing. - 'id': video_id, - 'title': metadata.get('title') or result.get('title'), - 'formats': formats, - 'duration': parse_duration(metadata.get('duration')) or result.get('duration'), - 'timestamp': ( - parse_iso8601(metadata.get('uploadtime'), delimiter=' ') - or result.get('timestamp')), - 'release_timestamp': result.get('release_timestamp'), - 'modified_timestamp': result.get('modified_timestamp'), - 'thumbnail': metadata.get('thumbnail') or result.get('thumbnail'), - 'description': result.get('description'), - 'uploader': metadata.get('author'), + or clean_html(get_element_by_class('vdbw-sapo', webpage))), + 'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')), + 'uploader': strip_or_none(metadata.get('author')), + 'timestamp': parse_iso8601(metadata.get('uploadtime'), delimiter=' '), 'view_count': int_or_none(metadata.get('views')), - 'tags': result.get('tags'), - 'webpage_url': page_url, - 'display_id': display_id, + 'tags': traverse_obj(self._html_search_meta('keywords', webpage), ( + {lambda x: x.split(';')}, lambda _, v: v, {str_or_none})), } - def _extract_playlist(self, webpage, url): - playlist_id = self._match_id(url) - display_id = self._get_display_id(url) - def _extract_playlist_videos(): - for video_listitem in self._get_list_videos(webpage): - attrs = extract_attributes(self._extract_video_div(video_listitem)) - direct_url = attrs.get('data-vid') - if not direct_url: - self.report_warning(f'Could not find expected video in playlist {playlist_id}') - continue - - video_id = attrs.get('data-item-id') - share_url = attrs.get('data-share') - if not share_url: - share_url = f'https://video.kenh14.vn/video-{video_id}.chn' - - basename = remove_end(remove_start(direct_url, 'kenh14cdn.com/'), '.mp4') - video_result = { - '_type': 'video', - 'id': video_id, - 'webpage_url': share_url, - 'display_id': display_id, - 'title': self._extract_title(video_listitem), - 'description': self._extract_description(video_listitem), - 'thumbnail': f'https://videothumbs.mediacdn.vn/kenh14/{basename}.jpg', - 'tags': self._extract_tag_list(video_listitem), - } - - if not self.get_param('extract_flat', False): - video_result.update(self._extract_video(None, share_url, direct_url)) - else: - video_result.update({'formats': self._extract_formats_wrapper(video_id, direct_url)}) - - yield video_result - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'tags': self._extract_page_tags(webpage), - 'display_id': display_id, - 'entries': _extract_playlist_videos(), - } - pass - - def _get_list_videos(self, webpage): - # get_elements_html_by_class('video-item', webpage) is NOT doing what I - # expect; it's matching list-video-item for some reason - return get_elements_html_by_attribute('class', 'video-item', webpage) - - def _extract_video_div(self, content): - return get_element_html_by_attribute('type', 'VideoStream', content) or '' - - def _extract_title(self, elem): - return clean_html(get_element_by_class('video-title', elem) or get_element_by_class('vdbw-title', elem)) - - def _extract_description(self, elem): - return clean_html(get_element_by_class('video-sapo', elem) or get_element_by_class('vdbw-sapo', elem)) - - def _extract_tag_list(self, elem): - in_class = list(get_elements_html_by_attribute('class', 'video-tag', elem)) - in_attr = list(get_elements_by_attribute('data', 'video-tag', elem)) - return list(filter(None, map(clean_html, in_class or in_attr))) - - def _extract_page_tags(self, webpage): - tags = self._html_search_meta('article:tag', webpage) or '' - return list(filter(None, map(lambda x: x.strip(), tags.split(',')))) - - def _download_webpage_wrapper(self, url, fatal=True): - video_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, video_id, fatal=fatal) - redirect_url = self._try_get_redirect_url(webpage) - new_urlh = urlh - if redirect_url: - # Page load relies on a javascript redirect - webpage, new_urlh = self._download_webpage_handle(redirect_url, video_id, fatal=fatal) - return webpage, (new_urlh or urlh).geturl() - - def _is_playlist(self, url): - return self._search_regex( - r'https?://video.kenh14\.vn/(?Pplaylist/|)?.*\.chn', - url, 'url type and display id', default=False, group=('is_playlist'), fatal=False) - - def _get_display_id(self, url): - return self._search_regex( - r'https?://video.kenh14\.vn/(?:playlist/|)?(?P.*?)-?[0-9]+\.chn', - url, 'url type and display id', group=('display_id'), fatal=False) +class Kenh14PlaylistIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', + 'info_dict': { + 'id': '71', + 'title': 'Trần Tình (Naked love) mùa 2', + 'description': 'md5:e9522339304956dea931722dd72eddb2', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 9, + }, { + 'url': 'https://video.kenh14.vn/playlist/0-72.chn', + 'info_dict': { + 'id': '72', + 'title': 'Lau Lại Đầu Từ', + 'description': 'Cùng xem xưa và nay có gì khác biệt nhé!', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 6, + }] def _real_extract(self, url): - webpage, url = self._download_webpage_wrapper(url) + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) - is_playlist = self._is_playlist(url) - self.write_debug('deduced page to be a ' + ('playlist' if is_playlist else 'video')) + category_detail = get_element_by_class('category-detail', webpage) + embed_info = traverse_obj( + self._yield_json_ld(webpage, playlist_id), + (lambda _, v: v['name'] and v['alternateName']), get_all=False) - if is_playlist: - return self._extract_playlist(webpage, url) - else: - return self._extract_video(webpage, url) + return self.playlist_from_matches( + get_elements_html_by_class('video-item', webpage), playlist_id, + (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))), + getter=lambda x: 'https://video.kenh14.vn/video/{}.chn'.format(extract_attributes(x)['data-id']), + ie=Kenh14VideoIE.ie_key(), playlist_description=( + clean_html(get_element_by_class('description', category_detail)) + or unescapeHTML(embed_info.get('alternateName'))), + thumbnail=traverse_obj( + self._og_search_thumbnail(webpage), + ({url_or_none}, {lambda x: urllib.parse.urlunparse(urllib.parse.urlparse(x)._replace(query=None))})))