[ie/Douyin] Fix extractor (#9239)

Closes #7854, Closes #7941
Authored by: 114514ns, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
This commit is contained in:
114514ns 2024-02-28 10:30:58 +08:00 committed by GitHub
parent e28e135d6f
commit 9ff9466455
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -6,7 +6,7 @@
import time import time
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse from ..compat import compat_urllib_parse_urlparse
from ..networking import HEADRequest from ..networking import HEADRequest
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
@ -15,7 +15,6 @@
UserNotLive, UserNotLive,
determine_ext, determine_ext,
format_field, format_field,
get_first,
int_or_none, int_or_none,
join_nonempty, join_nonempty,
merge_dicts, merge_dicts,
@ -219,8 +218,8 @@ def audio_meta(url):
def extract_addr(addr, add_meta={}): def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', '')) parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res: if res:
known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
parsed_meta.update(known_resolutions.get(res, {})) parsed_meta.update(known_resolutions.get(res, {}))
add_meta.setdefault('height', int_or_none(res[:-1])) add_meta.setdefault('height', int_or_none(res[:-1]))
return [{ return [{
@ -237,22 +236,26 @@ def extract_addr(addr, add_meta={}):
# Hack: Add direct video links first to prioritize them when removing duplicate formats # Hack: Add direct video links first to prioritize them when removing duplicate formats
formats = [] formats = []
width = int_or_none(video_info.get('width'))
height = int_or_none(video_info.get('height'))
if video_info.get('play_addr'): if video_info.get('play_addr'):
formats.extend(extract_addr(video_info['play_addr'], { formats.extend(extract_addr(video_info['play_addr'], {
'format_id': 'play_addr', 'format_id': 'play_addr',
'format_note': 'Direct video', 'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj( 'vcodec': 'h265' if traverse_obj(
video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002 video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'), 'width': width,
'height': video_info.get('height'), 'height': height,
})) }))
if video_info.get('download_addr'): if video_info.get('download_addr'):
formats.extend(extract_addr(video_info['download_addr'], { download_addr = video_info['download_addr']
dl_width = int_or_none(download_addr.get('width'))
formats.extend(extract_addr(download_addr, {
'format_id': 'download_addr', 'format_id': 'download_addr',
'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''), 'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
'vcodec': 'h264', 'vcodec': 'h264',
'width': video_info.get('width'), 'width': dl_width or width,
'height': video_info.get('height'), 'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong
'preference': -2 if video_info.get('has_watermark') else -1, 'preference': -2 if video_info.get('has_watermark') else -1,
})) }))
if video_info.get('play_addr_h264'): if video_info.get('play_addr_h264'):
@ -921,20 +924,23 @@ class DouyinIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.douyin.com/video/6961737553342991651', 'url': 'https://www.douyin.com/video/6961737553342991651',
'md5': 'a97db7e3e67eb57bf40735c022ffa228', 'md5': '9ecce7bc5b302601018ecb2871c63a75',
'info_dict': { 'info_dict': {
'id': '6961737553342991651', 'id': '6961737553342991651',
'ext': 'mp4', 'ext': 'mp4',
'title': '#杨超越 小小水手带你去远航❤️', 'title': '#杨超越 小小水手带你去远航❤️',
'description': '#杨超越 小小水手带你去远航❤️', 'description': '#杨超越 小小水手带你去远航❤️',
'uploader': '6897520xka',
'uploader_id': '110403406559', 'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越', 'creator': '杨超越',
'duration': 19782, 'creators': ['杨超越'],
'duration': 19,
'timestamp': 1620905839, 'timestamp': 1620905839,
'upload_date': '20210513', 'upload_date': '20210513',
'track': '@杨超越创作的原声', 'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
@ -943,20 +949,23 @@ class DouyinIE(TikTokBaseIE):
}, },
}, { }, {
'url': 'https://www.douyin.com/video/6982497745948921092', 'url': 'https://www.douyin.com/video/6982497745948921092',
'md5': '34a87ebff3833357733da3fe17e37c0e', 'md5': '15c5e660b7048af3707304e3cc02bbb5',
'info_dict': { 'info_dict': {
'id': '6982497745948921092', 'id': '6982497745948921092',
'ext': 'mp4', 'ext': 'mp4',
'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader': '0731chaoyue',
'uploader_id': '408654318141572', 'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室', 'creator': '杨超越工作室',
'duration': 42479, 'creators': ['杨超越工作室'],
'duration': 42,
'timestamp': 1625739481, 'timestamp': 1625739481,
'upload_date': '20210708', 'upload_date': '20210708',
'track': '@杨超越工作室创作的原声', 'track': '@杨超越工作室创作的原声',
'artists': ['杨超越工作室'],
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
@ -965,20 +974,23 @@ class DouyinIE(TikTokBaseIE):
}, },
}, { }, {
'url': 'https://www.douyin.com/video/6953975910773099811', 'url': 'https://www.douyin.com/video/6953975910773099811',
'md5': 'dde3302460f19db59c47060ff013b902', 'md5': '0e6443758b8355db9a3c34864a4276be',
'info_dict': { 'info_dict': {
'id': '6953975910773099811', 'id': '6953975910773099811',
'ext': 'mp4', 'ext': 'mp4',
'title': '#一起看海 出现在你的夏日里', 'title': '#一起看海 出现在你的夏日里',
'description': '#一起看海 出现在你的夏日里', 'description': '#一起看海 出现在你的夏日里',
'uploader': '6897520xka',
'uploader_id': '110403406559', 'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越', 'creator': '杨超越',
'duration': 17343, 'creators': ['杨超越'],
'duration': 17,
'timestamp': 1619098692, 'timestamp': 1619098692,
'upload_date': '20210422', 'upload_date': '20210422',
'track': '@杨超越创作的原声', 'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
@ -1004,20 +1016,23 @@ class DouyinIE(TikTokBaseIE):
'skip': 'No longer available', 'skip': 'No longer available',
}, { }, {
'url': 'https://www.douyin.com/video/6963263655114722595', 'url': 'https://www.douyin.com/video/6963263655114722595',
'md5': 'cf9f11f0ec45d131445ec2f06766e122', 'md5': '1440bcf59d8700f8e014da073a4dfea8',
'info_dict': { 'info_dict': {
'id': '6963263655114722595', 'id': '6963263655114722595',
'ext': 'mp4', 'ext': 'mp4',
'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader': '6897520xka',
'uploader_id': '110403406559', 'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越', 'creator': '杨超越',
'duration': 15115, 'creators': ['杨超越'],
'duration': 15,
'timestamp': 1621261163, 'timestamp': 1621261163,
'upload_date': '20210517', 'upload_date': '20210517',
'track': '@杨超越创作的原声', 'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
@ -1025,34 +1040,23 @@ class DouyinIE(TikTokBaseIE):
'thumbnail': r're:https?://.+\.jpe?g', 'thumbnail': r're:https?://.+\.jpe?g',
}, },
}] }]
_APP_VERSIONS = [('23.3.0', '230300')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
_UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s' _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
_WEBPAGE_HOST = 'https://www.douyin.com/' _WEBPAGE_HOST = 'https://www.douyin.com/'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
try: detail = traverse_obj(self._download_json(
return self._extract_aweme_app(video_id) 'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
except ExtractorError as e: 'Downloading web detail JSON', 'Failed to download web detail JSON',
e.expected = True query={'aweme_id': video_id}, fatal=False), ('aweme_detail', {dict}))
self.to_screen(f'{e}; trying with webpage') if not detail:
webpage = self._download_webpage(url, video_id)
render_data = self._search_json(
r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id,
contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote)
if not render_data:
# TODO: Run verification challenge code to generate signature cookies # TODO: Run verification challenge code to generate signature cookies
cookies = self._get_cookies(self._WEBPAGE_HOST)
expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid')
raise ExtractorError( raise ExtractorError(
'Fresh cookies (not necessarily logged in) are needed', expected=expected) 'Fresh cookies (not necessarily logged in) are needed',
expected=not self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id'))
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id) return self._parse_aweme_video_app(detail)
class TikTokVMIE(InfoExtractor): class TikTokVMIE(InfoExtractor):