From 4f8b70b5933e991c0e4eb75f9e51c5afa69e30ca Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 23 Aug 2021 18:15:01 +0530 Subject: [PATCH] [TikTok] Fix metadata extraction --- yt_dlp/extractor/tiktok.py | 171 ++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 97 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index a0f0ae09c..9b5c3d3a9 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -11,125 +11,102 @@ from ..utils import ( ) -class TikTokBaseIE(InfoExtractor): - def _extract_aweme(self, props_data, webpage, url): - video_data = try_get(props_data, lambda x: x['pageProps'], expected_type=dict) - video_info = try_get( - video_data, lambda x: x['itemInfo']['itemStruct'], dict) - author_info = try_get( - video_data, lambda x: x['itemInfo']['itemStruct']['author'], dict) or {} - share_info = try_get(video_data, lambda x: x['itemInfo']['shareMeta'], dict) or {} - - unique_id = str_or_none(author_info.get('uniqueId')) - timestamp = try_get(video_info, lambda x: int(x['createTime']), int) - date = datetime.fromtimestamp(timestamp).strftime('%Y%m%d') - - height = try_get(video_info, lambda x: x['video']['height'], int) - width = try_get(video_info, lambda x: x['video']['width'], int) - thumbnails = [] - thumbnails.append({ - 'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage), - 'width': width, - 'height': height - }) - - url = '' - if not url: - url = try_get(video_info, lambda x: x['video']['playAddr']) - if not url: - url = try_get(video_info, lambda x: x['video']['downloadAddr']) - formats = [] - formats.append({ - 'url': url, - 'ext': 'mp4', - 'height': height, - 'width': width - }) - - tracker = try_get(props_data, lambda x: x['initialProps']['$wid']) - return { - 'comment_count': int_or_none(video_info.get('commentCount')), - 'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int), - 'height': height, - 'id': 
str_or_none(video_info.get('id')), - 'like_count': int_or_none(video_info.get('diggCount')), - 'repost_count': int_or_none(video_info.get('shareCount')), - 'thumbnail': try_get(video_info, lambda x: x['covers'][0]), - 'timestamp': timestamp, - 'width': width, - 'title': str_or_none(share_info.get('title')) or self._og_search_title(webpage), - 'creator': str_or_none(author_info.get('nickName')), - 'uploader': unique_id, - 'uploader_id': str_or_none(author_info.get('userId')), - 'uploader_url': 'https://www.tiktok.com/@' + unique_id, - 'thumbnails': thumbnails, - 'upload_date': date, - 'webpage_url': self._og_search_url(webpage), - 'description': str_or_none(video_info.get('text')) or str_or_none(share_info.get('desc')), - 'ext': 'mp4', - 'formats': formats, - 'http_headers': { - 'Referer': url, - 'Cookie': 'tt_webid=%s; tt_webid_v2=%s' % (tracker, tracker), - } - } - - -class TikTokIE(TikTokBaseIE): +class TikTokIE(InfoExtractor): _VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', 'md5': '34a7543afd5a151b0840ba6736fb633b', 'info_dict': { - 'comment_count': int, - 'creator': 'facestoriesbyleenabh', - 'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95', - 'duration': 13, - 'ext': 'mp4', - 'formats': list, - 'height': 1280, 'id': '6748451240264420610', - 'like_count': int, - 'repost_count': int, - 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', - 'thumbnails': list, - 'timestamp': 1571246252, - 'title': 'facestoriesbyleenabh on TikTok', - 'upload_date': '20191016', + 'ext': 'mp4', + 'title': '#jassmanak #lehanga #leenabhushan', + 'description': '#jassmanak #lehanga #leenabhushan', + 'duration': 13, + 'height': 1280, + 'width': 720, 'uploader': 'leenabhushan', 'uploader_id': '6691488002098119685', - 'uploader_url': r're:https://www.tiktok.com/@leenabhushan', - 'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610', - 'width': 
720, + 'uploader_url': 'https://www.tiktok.com/@leenabhushan', + 'creator': 'facestoriesbyleenabh', + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20191016', + 'timestamp': 1571246252, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, } }, { 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', 'md5': '06b9800d47d5fe51a19e322dd86e61c9', 'info_dict': { - 'comment_count': int, - 'creator': 'patroX', + 'id': '6742501081818877190', + 'ext': 'mp4', + 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94', 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', 'duration': 27, - 'ext': 'mp4', - 'formats': list, 'height': 960, - 'id': '6742501081818877190', + 'width': 540, + 'uploader': 'patrox', + 'uploader_id': '18702747', + 'uploader_url': 'https://www.tiktok.com/@patrox', + 'creator': 'patroX', + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20190930', + 'timestamp': 1569860870, + 'view_count': int, 'like_count': int, 'repost_count': int, - 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', - 'thumbnails': list, - 'timestamp': 1569860870, - 'title': 'patroX on TikTok', - 'upload_date': '20190930', - 'uploader': 'patroxofficial', - 'uploader_id': '18702747', - 'uploader_url': r're:https://www.tiktok.com/@patroxofficial', - 'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190', - 'width': 540, + 'comment_count': int, } }] + def _extract_aweme(self, props_data, webpage, url): + video_info = try_get( + props_data, lambda x: x['pageProps']['itemInfo']['itemStruct'], dict) + author_info = try_get( + props_data, lambda x: x['pageProps']['itemInfo']['itemStruct']['author'], dict) or {} + stats_info = try_get(props_data, lambda x: x['pageProps']['itemInfo']['itemStruct']['stats'], dict) or {} + + user_id = str_or_none(author_info.get('uniqueId')) + download_url = try_get(video_info, (lambda x: 
x['video']['playAddr'], + lambda x: x['video']['downloadAddr'])) + height = try_get(video_info, lambda x: x['video']['height'], int) + width = try_get(video_info, lambda x: x['video']['width'], int) + thumbnails = [{ + 'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'width': width, + 'height': height + }] + tracker = try_get(props_data, lambda x: x['initialProps']['$wid']) + + return { + 'id': str_or_none(video_info.get('id')), + 'url': download_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + 'title': video_info.get('desc') or self._og_search_title(webpage), + 'duration': try_get(video_info, lambda x: x['video']['duration'], int), + 'view_count': int_or_none(stats_info.get('playCount')), + 'like_count': int_or_none(stats_info.get('diggCount')), + 'repost_count': int_or_none(stats_info.get('shareCount')), + 'comment_count': int_or_none(stats_info.get('commentCount')), + 'timestamp': try_get(video_info, lambda x: int(x['createTime']), int), + 'creator': str_or_none(author_info.get('nickname')), + 'uploader': user_id, + 'uploader_id': str_or_none(author_info.get('id')), + 'uploader_url': f'https://www.tiktok.com/@{user_id}', + 'thumbnails': thumbnails, + 'description': str_or_none(video_info.get('desc')), + 'webpage_url': self._og_search_url(webpage), + 'http_headers': { + 'Referer': url, + 'Cookie': 'tt_webid=%s; tt_webid_v2=%s' % (tracker, tracker), + } + } + def _real_extract(self, url): video_id = self._match_id(url)