split extractors

This commit is contained in:
c-basalt 2024-05-10 23:18:27 -04:00
parent 83e2e40790
commit 5b35ca7333
2 changed files with 130 additions and 64 deletions

View File

@ -1906,7 +1906,12 @@ from .syvdk import SYVDKIE
from .syfy import SyfyIE from .syfy import SyfyIE
from .sztvhu import SztvHuIE from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE from .tagesschau import TagesschauIE
from .taptap import TapTapIE from .taptap import (
TapTapMomentIE,
TapTapAppIE,
TapTapAppIntlIE,
TapTapPostIntlIE,
)
from .tass import TassIE from .tass import TassIE
from .tbs import TBSIE from .tbs import TBSIE
from .tbsjp import ( from .tbsjp import (

View File

@ -1,20 +1,84 @@
import random
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html, clean_html,
int_or_none, int_or_none,
parse_qs, join_nonempty,
str_or_none,
traverse_obj, traverse_obj,
url_or_none, url_or_none,
) )
class TapTapIE(InfoExtractor): class TapTapBaseIE(InfoExtractor):
_VALID_URL = r'https?://www\.taptap\.cn/(?P<section>moment|app)/(?P<id>\d+)' _X_UA = 'V=1&PN=WebApp&LANG=zh_CN&VN_CODE=102&LOC=CN&PLT=PC&DS=Android&UID={uuid}&OS=Windows&OSV=10&DT=PC'
# Endpoint that resolves video ids to stream information (used by _extract_video).
_VIDEO_API = 'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get'
# Detail endpoint queried by _real_extract; None here, set by concrete subclasses.
_INFO_API = None
# Name of the query parameter that carries the page id when calling _INFO_API.
_INFO_QUERY_KEY = 'id'
# traverse_obj path from the _INFO_API response to the payload of interest.
# Spelled as a real one-element tuple: ('data') is merely the string 'data',
# which only looked like the tuple paths that subclasses override this with.
_DATA_PATH = ('data',)
# traverse_obj path (applied to the _DATA_PATH payload) that yields the video ids.
_ID_PATH = None
# traverse_obj path (applied to the _DATA_PATH payload) that yields shared metadata.
_META_PATH = None
def _get_api(self, url, video_id, query, **kwargs):
    """Download JSON from a TapTap web API endpoint.

    The caller's ``query`` mapping is copied (never mutated) and extended
    with an ``X-UA`` parameter built from ``self._X_UA`` and a freshly
    generated random UUID. Any extra keyword arguments are forwarded
    unchanged to ``self._download_json``, whose result is returned.
    """
    # Local import so this block does not depend on the file's import section.
    import uuid

    # str(uuid4()) has the same 8-4-4-4-12 lowercase-hex shape that was
    # previously assembled by hand from individual random hex digits.
    query = {**query, 'X-UA': self._X_UA.format(uuid=uuid.uuid4())}
    return self._download_json(url, video_id, query=query, **kwargs)
def _extract_video(self, video_id, is_intl=False):
    """Build the info dict (id, formats, duration, thumbnail) for one video id.

    ``is_intl`` is kept for backward compatibility with existing callers;
    the body does not use it.
    """
    video_data = self._get_api(
        self._VIDEO_API, video_id, query={'video_ids': video_id})['data']['list'][0]

    # 'url_h265' is tried before 'url'. get_all=False returns the first valid
    # URL, or None when the response carries none, instead of raising a bare
    # IndexError the way indexing the result list with [0] did.
    video_url = traverse_obj(
        video_data, ('play_url', ('url_h265', 'url'), {url_or_none}), get_all=False)
    # With no URL, return an empty format list rather than crashing here.
    formats = self._extract_m3u8_formats(video_url, video_id) if video_url else []

    for fmt in formats:  # named ``fmt`` to avoid shadowing the ``format`` builtin
        # Mark HEVC renditions in their format id; ``or ''`` guards against a
        # 'vcodec' key that is present but None.
        if re.search(r'^(hev|hvc|hvt)\d', fmt.get('vcodec') or ''):
            fmt['format_id'] = join_nonempty(fmt.get('format_id'), 'h265', delim='_')

    return {
        'id': str(video_id),
        'formats': formats,
        **traverse_obj(video_data, {
            'duration': ('info', 'duration', {int_or_none}),
            'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}),
        }, get_all=False),
    }
def _extract_entries(self, video_ids, metainfo, list_id):
    """Return a playlist result with one entry per distinct video id.

    Each entry is ``metainfo`` overlaid with the result of
    ``self._extract_video`` for that id; ``metainfo`` is also passed to
    ``self.playlist_result`` together with ``id=list_id``.
    ``video_ids`` and ``metainfo`` may be None (treated as empty).
    """
    metainfo = metainfo or {}
    # dict.fromkeys() removes duplicates while keeping first-seen order;
    # set() made the playlist order arbitrary between runs.
    unique_ids = dict.fromkeys(video_ids or ())
    entries = [{**metainfo, **self._extract_video(vid)} for vid in unique_ids]
    return self.playlist_result(entries, **metainfo, id=list_id)
def _real_extract(self, url):
    # Fetch the page's detail record from _INFO_API, narrow it to the
    # _DATA_PATH payload, then pass the video ids and shared metadata found
    # there to _extract_entries, using the page id as the playlist id.
    page_id = self._match_id(url)
    response = self._get_api(
        self._INFO_API, page_id, query={self._INFO_QUERY_KEY: page_id})
    payload = traverse_obj(response, self._DATA_PATH)
    return self._extract_entries(
        traverse_obj(payload, self._ID_PATH),
        traverse_obj(payload, self._META_PATH),
        page_id)
class TapTapIntlBase(TapTapBaseIE):
    # Variant of the base extractor that points at taptap.io: it overrides
    # the video-resource endpoint and the X-UA template (the {uuid}
    # placeholder is filled in per request by _get_api).
    _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get'
    _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0'
class TapTapMomentIE(TapTapBaseIE):
_VALID_URL = r'https?://www\.taptap\.cn/moment/(?P<id>\d+)'
_INFO_API = 'https://www.taptap.cn/webapiv2/moment/v3/detail'
_ID_PATH = ('moment', 'topic', (('videos', ...), 'pin_video'), 'video_id')
_META_PATH = ('moment', {
'timestamp': ('created_time', {int_or_none}),
'uploader': ('author', 'user', 'name', {str}),
'title': ('topic', 'title', {str}),
'description': ('topic', 'summary', {str}),
})
_TESTS = [{ _TESTS = [{
'url': 'https://www.taptap.cn/moment/194618230982052443', 'url': 'https://www.taptap.cn/moment/194618230982052443',
'info_dict': { 'info_dict': {
'id': 'moment_194618230982052443', 'id': '194618230982052443',
'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星', 'title': '《崩坏3》开放世界「后崩坏书」新篇章 于淹没之地仰视辰星',
'description': 'md5:cf66f7819d413641b8b28c8543f4ecda', 'description': 'md5:cf66f7819d413641b8b28c8543f4ecda',
'timestamp': 1633453402, 'timestamp': 1633453402,
@ -38,7 +102,7 @@ class TapTapIE(InfoExtractor):
}, { }, {
'url': 'https://www.taptap.cn/moment/521630629209573493', 'url': 'https://www.taptap.cn/moment/521630629209573493',
'info_dict': { 'info_dict': {
'id': 'moment_521630629209573493', 'id': '521630629209573493',
'title': '《崩坏星穹铁道》黄泉角色PV——「你的颜色」', 'title': '《崩坏星穹铁道》黄泉角色PV——「你的颜色」',
'description': 'md5:2c81245da864428c904d53ae4ad2182b', 'description': 'md5:2c81245da864428c904d53ae4ad2182b',
'timestamp': 1711425600, 'timestamp': 1711425600,
@ -59,10 +123,21 @@ class TapTapIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.(png|jpg)', 'thumbnail': r're:^https?://.*\.(png|jpg)',
} }
}] }]
}, { }]
class TapTapAppIE(TapTapBaseIE):
_VALID_URL = r'https?://www\.taptap\.cn/app/(?P<id>\d+)'
_INFO_API = 'https://www.taptap.cn/webapiv2/app/v4/detail'
_ID_PATH = (('app_videos', 'videos'), ..., 'video_id')
_META_PATH = {
'title': ('title', {str}),
'description': ('description', 'text', {str}, {clean_html}),
}
_TESTS = [{
'url': 'https://www.taptap.cn/app/168332', 'url': 'https://www.taptap.cn/app/168332',
'info_dict': { 'info_dict': {
'id': 'app_168332', 'id': '168332',
'title': '原神', 'title': '原神',
'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab', 'description': 'md5:e345f39a5fea5de2a46923f70d5f76ab',
}, },
@ -86,63 +161,49 @@ class TapTapIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.(png|jpg)', 'thumbnail': r're:^https?://.*\.(png|jpg)',
} }
}] }]
}] }]
def _deserialize_nuxt_data(self, serialized_nuxt):
    """Resolve index references in a flattened Nuxt payload, in place.

    ``serialized_nuxt`` is a list of rows. Inside every dict or list row,
    each integer entry is treated as the index of another row and replaced
    by that row. Rows are mutated in place, so nested structures end up
    fully linked once the loop has visited every row.

    Returns the first row. Raises IndexError on an empty payload or on a
    reference past the end of the list.
    """
    def resolve(value_or_ref):
        # bool is a subclass of int; True/False are values, not row indices.
        if isinstance(value_or_ref, bool) or not isinstance(value_or_ref, int):
            return value_or_ref
        # A negative number cannot be a row index; without this check it
        # would silently pick a row counted from the end of the list.
        # NOTE(review): presumably these are the serializer's sentinel
        # markers (undefined and similar) - confirm against the format spec.
        if value_or_ref < 0:
            return None
        return serialized_nuxt[value_or_ref]

    for row in serialized_nuxt:
        if isinstance(row, dict):
            # Only values are reassigned, so iterating items() stays safe.
            for key, value_or_ref in row.items():
                row[key] = resolve(value_or_ref)
        elif isinstance(row, list):
            # Slice assignment keeps the same list object, so rows that
            # already point at this list see the resolved contents.
            row[:] = [resolve(value_or_ref) for value_or_ref in row]
    return serialized_nuxt[0]
def _extract_video(self, video_id, x_ua): class TapTapAppIntlIE(TapTapAppIE, TapTapIntlBase):
data = self._download_json( _VALID_URL = r'https?://www\.taptap\.io/app/(?P<id>\d+)'
'https://www.taptap.cn/webapiv2/video-resource/v1/multi-get', video_id, _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail'
query={'video_ids': video_id, 'X-UA': x_ua}) _DATA_PATH = ('data', 'app')
video = traverse_obj(data, ('data', 'list', 0, {
'id': ('video_id', {str_or_none}),
'url': ('play_url', ('url', 'url_h265'), {url_or_none}),
'duration': ('info', 'duration', {int_or_none}),
'thumbnail': ('thumbnail', ('original_url', 'url'), {url_or_none}),
}), get_all=False)
if '.m3u8' in video['url']:
video['formats'] = self._extract_m3u8_formats(video.pop('url'), video_id)
return video
def _real_extract(self, url): class TapTapPostIntlIE(TapTapAppIntlIE):
section, list_id = self._match_valid_url(url).groups() _VALID_URL = r'https?://www\.taptap\.io/post/(?P<id>\d+)'
list_id = f'{section}_{list_id}' _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail'
_INFO_QUERY_KEY = 'id_str'
webpage = self._download_webpage(url, list_id) _DATA_PATH = ('data', 'post')
nuxt_data = self._deserialize_nuxt_data(self._search_json( _ID_PATH = ((('videos', ...), 'pin_video'), 'video_id')
r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>', webpage, _META_PATH = {
'nuxt data', list_id, contains_pattern=r'\[(?s:.+)\]'))[1] 'timestamp': ('published_time', {int_or_none}),
x_ua = traverse_obj(nuxt_data, ( 'uploader': ('user', 'name', {str}),
'state', '$sbff', ..., {lambda x: parse_qs(x)['X-UA']}, ...), get_all=False) 'title': ('title', {str}),
'description': ('list_fields', 'summary', {str}),
if section == 'moment': }
moment_data = traverse_obj(nuxt_data, ('data', ..., 'moment'), get_all=False) _TESTS = [{
video_ids = traverse_obj(moment_data, ('topic', (('videos', ...), 'pin_video'), 'video_id')) 'url': 'https://www.taptap.io/post/571785',
metainfo = traverse_obj(moment_data, { 'info_dict': {
'timestamp': ('created_time', {int_or_none}), 'id': '571785',
'uploader': ('author', 'user', 'name', {str}), 'title': 'Arknights x Rainbow Six Siege | Event PV',
'title': ('topic', 'title', {str}), 'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
'description': ('topic', 'summary', {str}), 'timestamp': 1614664951,
}) 'upload_date': '20210302',
elif section == 'app': 'uploader': 'TapTap Editor',
video_ids = traverse_obj(nuxt_data, ('data', ..., ('app_videos', 'videos'), ..., 'video_id')) },
metainfo = traverse_obj(nuxt_data, ('data', ..., { 'playlist_count': 1,
'title': ('title', {str}), 'playlist': [{
'description': ('description', 'text', {str}, {clean_html}), 'info_dict': {
}), get_all=False) 'id': '2149491903',
'ext': 'mp4',
entries = [self._extract_video(video_id, x_ua) for video_id in set(video_ids)] 'title': 'Arknights x Rainbow Six Siege | Event PV',
'description': 'md5:f7717c13f6d3108e22db7303e6690bf7',
return self.playlist_result([{**metainfo, **e} for e in entries], **metainfo, id=list_id) 'duration': 122,
'timestamp': 1614664951,
'upload_date': '20210302',
'uploader': 'TapTap Editor',
'thumbnail': r're:^https?://.*\.(png|jpg)',
}
}]
}]