From bfbecd1174a9e2ee08117352c26e664d36f1cc17 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Wed, 31 Aug 2022 02:07:55 +0900 Subject: [PATCH] [extractor/newspicks] Add extractor (#4725) Authored by: Lesmiscore --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/common.py | 4 +-- yt_dlp/extractor/newspicks.py | 54 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/extractor/newspicks.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 60e1b716f..1cded3ddf 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1083,6 +1083,7 @@ NewgroundsPlaylistIE, NewgroundsUserIE, ) +from .newspicks import NewsPicksIE from .newstube import NewstubeIE from .newsy import NewsyIE from .nextmedia import ( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f950d28ed..b79221955 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3260,7 +3260,7 @@ def _media_formats(src, cur_media_type, type_info=None): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) + src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source'))) if src: f = parse_content_type(media_attributes.get('type')) _, formats = _media_formats(src, media_type, f) @@ -3271,7 +3271,7 @@ def _media_formats(src, cur_media_type, type_info=None): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source'))) if not src: continue f = parse_content_type(s_attr.get('type')) diff --git a/yt_dlp/extractor/newspicks.py b/yt_dlp/extractor/newspicks.py new file mode 100644 index 000000000..0232d5357 --- /dev/null +++ 
# b/yt_dlp/extractor/newspicks.py @@ -0,0 +1,54 @@
# (tail of the diff header for the newly added file; its content follows)
import re

from .common import InfoExtractor
from ..utils import ExtractorError


class NewsPicksIE(InfoExtractor):
    """Extractor for NewsPicks "movie-series" pages.

    The URL carries both identifiers: the series number in the path is
    exposed as ``channel_id`` and the ``movieId`` query value as ``id``.
    """

    # Named groups are required: _real_extract() reads 'id' and 'channel_id'.
    # The dot is escaped so that only the literal host "newspicks.com" matches.
    _VALID_URL = r'https://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://newspicks.com/movie-series/11?movieId=1813',
        'info_dict': {
            'id': '1813',
            'title': '日本の課題を破壊せよ【ゲスト:成田悠輔】',
            'description': 'md5:09397aad46d6ded6487ff13f138acadf',
            'channel': 'HORIE ONE',
            'channel_id': '11',
            'release_date': '20220117',
            'thumbnail': r're:https://.+jpg',
            'ext': 'mp4',
        },
    }]

    def _real_extract(self, url):
        """Download the movie page and build the info dict for its video.

        Raises ExtractorError when the page yields no HTML5 media entries.
        """
        video_id, channel_id = self._match_valid_url(url).group('id', 'channel_id')
        webpage = self._download_webpage(url, video_id)

        # 'movie-for-pc' is rewritten to 'movie' before the generic HTML5
        # media parser runs over the page.
        entries = self._parse_html5_media_entries(
            url, webpage.replace('movie-for-pc', 'movie'), video_id, 'hls')
        if not entries:
            raise ExtractorError('No HTML5 media elements found')
        info = entries[0]
        self._sort_formats(info['formats'])

        title = self._html_search_meta('og:title', webpage, fatal=False)
        description = self._html_search_meta(
            ('og:description', 'twitter:title'), webpage, fatal=False)

        # NOTE(review): the markup around the capture group was lost from the
        # patch text; the <div class="title"> delimiters are reconstructed and
        # should be confirmed against a live page. The literal value="11" is
        # kept as found - presumably it only matches one series; verify.
        channel = self._html_search_regex(
            r'value="11".+?<div\s+class="title">(.+?)</div',
            webpage, 'channel name', fatal=False)

        # The date is a (year, month, day) tuple, so the plain _search_regex
        # is used: _html_search_regex post-processes its result as one string.
        # NOTE(review): the enclosing <span class="on-air-date"> tags are
        # reconstructed as well - confirm against a live page.
        release_date = self._search_regex(
            r'<span\s+class="on-air-date">\s*(\d+)年(\d+)月(\d+)日\s*</span>',
            webpage, 'release date', fatal=False, group=(1, 2, 3))

        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'channel': channel,
            'channel_id': channel_id,
            # Zero-padded YYYYMMDD, or None when the date was not found.
            'release_date': ('%04d%02d%02d' % tuple(map(int, release_date))) if release_date else None,
        })
        return info