mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-09-28 13:47:53 +00:00
split extractors
This commit is contained in:
parent
83e2e40790
commit
5b35ca7333
|
@ -1906,7 +1906,12 @@
|
||||||
from .syfy import SyfyIE
|
from .syfy import SyfyIE
|
||||||
from .sztvhu import SztvHuIE
|
from .sztvhu import SztvHuIE
|
||||||
from .tagesschau import TagesschauIE
|
from .tagesschau import TagesschauIE
|
||||||
from .taptap import TapTapIE
|
from .taptap import (
|
||||||
|
TapTapMomentIE,
|
||||||
|
TapTapAppIE,
|
||||||
|
TapTapAppIntlIE,
|
||||||
|
TapTapPostIntlIE,
|
||||||
|
)
|
||||||
from .tass import TassIE
|
from .tass import TassIE
|
||||||
from .tbs import TBSIE
|
from .tbs import TBSIE
|
||||||
from .tbsjp import (
|
from .tbsjp import (
|
||||||
|
|
|
@ -1,20 +1,84 @@
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
clean_html,
|
clean_html,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
parse_qs,
|
join_nonempty,
|
||||||
str_or_none,
|
|
||||||
traverse_obj,
|
traverse_obj,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TapTapIE(InfoExtractor):
|
class TapTapBaseIE(InfoExtractor):
|
||||||
_VALID_URL = r'https?://www\.taptap\.cn/(?P<section>moment|app)/(?P<id>\d+)'
|
_X_UA = 'V=1&PN=WebApp&LANG=zh_CN&VN_CODE=102&LOC=CN&PLT=PC&DS=Android&UID={uuid}&OS=Windows&OSV=10&DT=PC'
|
||||||
|
_VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get'
|
||||||
|
_INFO_API = None
|
||||||
|
_INFO_QUERY_KEY = 'id'
|
||||||
|
_DATA_PATH = ('data')
|
||||||
|
_ID_PATH = None
|
||||||
|
_META_PATH = None
|
||||||
|
|
||||||
|
def _get_api(self, url, video_id, query, **kwargs):
|
||||||
|
rand_hex = lambda digits: ''.join(f'{random.randint(0, 15):x}' for _ in range(digits))
|
||||||
|
uuid = '-'.join(rand_hex(digits) for digits in [8, 4, 4, 4, 12])
|
||||||
|
query = {**query, 'X-UA': self._X_UA.format(uuid=uuid)}
|
||||||
|
return self._download_json(url, video_id, query=query, **kwargs)
|
||||||
|
|
||||||
|
def _extract_video(self, video_id, is_intl=False):
|
||||||
|
video_data = self._get_api(self._VIDEO_API, video_id, query={'video_ids': video_id})['data']['list'][0]
|
||||||
|
|
||||||
|
video_url = traverse_obj(video_data, ('play_url', ('url_h265', 'url'), {url_or_none}))[0]
|
||||||
|
formats = self._extract_m3u8_formats(video_url, video_id)
|
||||||
|
for format in formats:
|
||||||
|
if re.search(r'^(hev|hvc|hvt)\d', format.get('vcodec', '')):
|
||||||
|
format['format_id'] = join_nonempty(format.get('format_id'), 'h265', delim='_')
|
||||||
|
|
||||||
|
return {
|
||||||
|
'id': str(video_id),
|
||||||
|
'formats': formats,
|
||||||
|
**traverse_obj(video_data, ({
|
||||||
|
'duration': ('info', 'duration', {int_or_none}),
|
||||||
|
'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}),
|
||||||
|
}), get_all=False)
|
||||||
|
}
|
||||||
|
|
||||||
|
def _extract_entries(self, video_ids, metainfo, list_id):
|
||||||
|
entries = [{**metainfo, **self._extract_video(id)} for id in set(video_ids)]
|
||||||
|
return self.playlist_result(entries, **metainfo, id=list_id)
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
video_id = self._match_id(url)
|
||||||
|
query = {self._INFO_QUERY_KEY: video_id}
|
||||||
|
|
||||||
|
data = traverse_obj(
|
||||||
|
self._get_api(self._INFO_API, video_id, query=query), self._DATA_PATH)
|
||||||
|
|
||||||
|
video_ids = traverse_obj(data, self._ID_PATH)
|
||||||
|
metainfo = traverse_obj(data, self._META_PATH)
|
||||||
|
return self._extract_entries(video_ids, metainfo, video_id)
|
||||||
|
|
||||||
|
|
||||||
|
class TapTapIntlBase(TapTapBaseIE):
|
||||||
|
_X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0'
|
||||||
|
_VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get'
|
||||||
|
|
||||||
|
|
||||||
|
class TapTapMomentIE(TapTapBaseIE):
|
||||||
|
_VALID_URL = r'https?://www\.taptap\.cn/moment/(?P<id>\d+)'
|
||||||
|
_INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail'
|
||||||
|
_ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id')
|
||||||
|
_META_PATH = ('moment', {
|
||||||
|
'timestamp': ('created_time', {int_or_none}),
|
||||||
|
'uploader': ('author', 'user', 'name', {str}),
|
||||||
|
'title': ('topic', 'title', {str}),
|
||||||
|
'description': ('topic', 'summary', {str}),
|
||||||
|
})
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://www.taptap.cn/moment/194618230982052443',
|
'url': 'https://www.taptap.cn/moment/194618230982052443',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'moment_194618230982052443',
|
'id': '194618230982052443',
|
||||||
'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星',
|
'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星',
|
||||||
'description': 'md5:cf66f7819d413641b8b28c8543f4ecda',
|
'description': 'md5:cf66f7819d413641b8b28c8543f4ecda',
|
||||||
'timestamp': 1633453402,
|
'timestamp': 1633453402,
|
||||||
|
@ -38,7 +102,7 @@ class TapTapIE(InfoExtractor):
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.taptap.cn/moment/521630629209573493',
|
'url': 'https://www.taptap.cn/moment/521630629209573493',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'moment_521630629209573493',
|
'id': '521630629209573493',
|
||||||
'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」',
|
'title': '《崩坏:星穹铁道》黄泉角色PV——「你的颜色」',
|
||||||
'description': 'md5:2c81245da864428c904d53ae4ad2182b',
|
'description': 'md5:2c81245da864428c904d53ae4ad2182b',
|
||||||
'timestamp': 1711425600,
|
'timestamp': 1711425600,
|
||||||
|
@ -59,10 +123,21 @@ class TapTapIE(InfoExtractor):
|
||||||
'thumbnail': r're:^https?://.*\.(png|jpg)',
|
'thumbnail': r're:^https?://.*\.(png|jpg)',
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
}, {
|
}]
|
||||||
|
|
||||||
|
|
||||||
|
class TapTapAppIE(TapTapBaseIE):
|
||||||
|
_VALID_URL = r'https?://www\.taptap\.cn/app/(?P<id>\d+)'
|
||||||
|
_INFO_API = 'https://www.taptap.cn/webapiv2/app/v4/detail'
|
||||||
|
_ID_PATH = (('app_videos', 'videos'), ..., 'video_id')
|
||||||
|
_META_PATH = {
|
||||||
|
'title': ('title', {str}),
|
||||||
|
'description': ('description', 'text', {str}, {clean_html}),
|
||||||
|
}
|
||||||
|
_TESTS = [{
|
||||||
'url': 'https://www.taptap.cn/app/168332',
|
'url': 'https://www.taptap.cn/app/168332',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'app_168332',
|
'id': '168332',
|
||||||
'title': '原神',
|
'title': '原神',
|
||||||
'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
|
'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
|
||||||
},
|
},
|
||||||
|
@ -86,63 +161,49 @@ class TapTapIE(InfoExtractor):
|
||||||
'thumbnail': r're:^https?://.*\.(png|jpg)',
|
'thumbnail': r're:^https?://.*\.(png|jpg)',
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
|
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _deserialize_nuxt_data(self, serialized_nuxt):
|
|
||||||
for row in serialized_nuxt:
|
|
||||||
if isinstance(row, dict):
|
|
||||||
for key, value_or_ref in row.items():
|
|
||||||
if isinstance(value_or_ref, int):
|
|
||||||
row[key] = serialized_nuxt[value_or_ref]
|
|
||||||
elif isinstance(row, list):
|
|
||||||
for index, value_or_ref in tuple(enumerate(row)):
|
|
||||||
if isinstance(value_or_ref, int):
|
|
||||||
row[index] = serialized_nuxt[value_or_ref]
|
|
||||||
return serialized_nuxt[0]
|
|
||||||
|
|
||||||
def _extract_video(self, video_id, x_ua):
|
class TapTapAppIntlIE(TapTapAppIE, TapTapIntlBase):
|
||||||
data = self._download_json(
|
_VALID_URL = r'https?://www\.taptap\.io/app/(?P<id>\d+)'
|
||||||
'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get', video_id,
|
_INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail'
|
||||||
query={'video_ids': video_id, 'X-UA': x_ua})
|
_DATA_PATH = ('data', 'app')
|
||||||
|
|
||||||
video = traverse_obj(data, ('data', 'list', 0, {
|
|
||||||
'id': ('video_id', {str_or_none}),
|
|
||||||
'url': ('play_url', ('url', 'url_h265'), {url_or_none}),
|
|
||||||
'duration': ('info', 'duration', {int_or_none}),
|
|
||||||
'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}),
|
|
||||||
}), get_all=False)
|
|
||||||
if '.m3u8' in video['url']:
|
|
||||||
video['formats'] = self._extract_m3u8_formats(video.pop('url'), video_id)
|
|
||||||
return video
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
class TapTapPostIntlIE(TapTapAppIntlIE):
|
||||||
section, list_id = self._match_valid_url(url).groups()
|
_VALID_URL = r'https?://www\.taptap\.io/post/(?P<id>\d+)'
|
||||||
list_id = f'{section}_{list_id}'
|
_INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail'
|
||||||
|
_INFO_QUERY_KEY = 'id_str'
|
||||||
webpage = self._download_webpage(url, list_id)
|
_DATA_PATH = ('data', 'post')
|
||||||
nuxt_data = self._deserialize_nuxt_data(self._search_json(
|
_ID_PATH = ((('videos', ...), 'pin_video'), 'video_id')
|
||||||
r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>', webpage,
|
_META_PATH = {
|
||||||
'nuxt data', list_id, contains_pattern=r'\[(?s:.+)\]'))[1]
|
'timestamp': ('published_time', {int_or_none}),
|
||||||
x_ua = traverse_obj(nuxt_data, (
|
'uploader': ('user', 'name', {str}),
|
||||||
'state', '$sbff', ..., {lambda x: parse_qs(x)['X-UA']}, ...), get_all=False)
|
|
||||||
|
|
||||||
if section == 'moment':
|
|
||||||
moment_data = traverse_obj(nuxt_data, ('data', ..., 'moment'), get_all=False)
|
|
||||||
video_ids = traverse_obj(moment_data, ('topic', (('videos', ...), 'pin_video'), 'video_id'))
|
|
||||||
metainfo = traverse_obj(moment_data, {
|
|
||||||
'timestamp': ('created_time', {int_or_none}),
|
|
||||||
'uploader': ('author', 'user', 'name', {str}),
|
|
||||||
'title': ('topic', 'title', {str}),
|
|
||||||
'description': ('topic', 'summary', {str}),
|
|
||||||
})
|
|
||||||
elif section == 'app':
|
|
||||||
video_ids = traverse_obj(nuxt_data, ('data', ..., ('app_videos', 'videos'), ..., 'video_id'))
|
|
||||||
metainfo = traverse_obj(nuxt_data, ('data', ..., {
|
|
||||||
'title': ('title', {str}),
|
'title': ('title', {str}),
|
||||||
'description': ('description', 'text', {str}, {clean_html}),
|
'description': ('list_fields', 'summary', {str}),
|
||||||
}), get_all=False)
|
}
|
||||||
|
_TESTS = [{
|
||||||
entries = [self._extract_video(video_id, x_ua) for video_id in set(video_ids)]
|
'url': 'https://www.taptap.io/post/571785',
|
||||||
|
'info_dict': {
|
||||||
return self.playlist_result([{**metainfo, **e} for e in entries], **metainfo, id=list_id)
|
'id': '571785',
|
||||||
|
'title': 'Arknights x Rainbow Six Siege | Event PV',
|
||||||
|
'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
|
||||||
|
'timestamp': 1614664951,
|
||||||
|
'upload_date': '20210302',
|
||||||
|
'uploader': 'TapTap Editor',
|
||||||
|
},
|
||||||
|
'playlist_count': 1,
|
||||||
|
'playlist': [{
|
||||||
|
'info_dict': {
|
||||||
|
'id': '2149491903',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Arknights x Rainbow Six Siege | Event PV',
|
||||||
|
'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
|
||||||
|
'duration': 122,
|
||||||
|
'timestamp': 1614664951,
|
||||||
|
'upload_date': '20210302',
|
||||||
|
'uploader': 'TapTap Editor',
|
||||||
|
'thumbnail': r're:^https?://.*\.(png|jpg)',
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
|
Loading…
Reference in a new issue