From 4b6d03ed873b04f027b4c7b734b7f8bd3c89eae6 Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Fri, 25 Oct 2019 13:35:54 +0530 Subject: [PATCH 01/10] [tiktok] fix extraction --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tiktok.py | 316 ++++++++++++++++++++--------- 2 files changed, 216 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1db21529f..abf5bb48d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1151,10 +1151,7 @@ from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, -) +from .tiktok import TikTokIE from .tinypic import TinyPicIE from .tmz import ( TMZIE, diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 66088b9ab..539fc9ecd 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,138 +1,252 @@ # coding: utf-8 from __future__ import unicode_literals +from datetime import datetime +import re from .common import InfoExtractor from ..utils import ( - compat_str, ExtractorError, int_or_none, str_or_none, - try_get, - url_or_none, + try_get ) class TikTokBaseIE(InfoExtractor): - def _extract_aweme(self, data): - video = data['video'] - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: video['width'])) - height = int_or_none(try_get(data, lambda x: video['height'])) + def _video_info(self, video_info): + return { + 'id': str_or_none(video_info.get('id')), + 'thumbnail': try_get(video_info, lambda x: x['covers'][0], str) or try_get(video_info, lambda x: x['video']['videoMeta']['cover'][0], str), + 'video_url': try_get(video_info, lambda x: x['video']['urls'][0], str) or video_info.get('video', {}).get('urls', [None])[0], + 'width': try_get(video_info, lambda x: x['video']['videoMeta']['width'], int) or 
try_get(video_info, lambda x: x['width'], int), + 'height': try_get(video_info, lambda x: x['video']['videoMeta']['height'], int) or try_get(video_info, lambda x: x['height'], int), + 'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int), + 'description': str_or_none(video_info.get('text')), + 'comment_count': int_or_none(video_info.get('commentCount')), + 'like_count': int_or_none(video_info.get('diggCount')), + 'repost_count': int_or_none(video_info.get('shareCount')), + 'timestamp': str_or_none(video_info.get('createTime')), + 'track_id': str_or_none(video_info.get('musicId')) + } + + def _author_info(self, author_info): + return { + 'uploader': str_or_none(author_info.get('uniqueId')), + 'creator': str_or_none(author_info.get('nickName')), + 'uploader_id': str_or_none(author_info.get('userId')), + 'channel_id': str_or_none(author_info.get('userId')) + } + + def _track_info(self, track_info): + return { + 'track': str_or_none(track_info.get('musicName')), + 'track_id': str_or_none(track_info.get('musicId')), + 'artist': str_or_none(track_info.get('authorName')) + } + + def _share_info(self, share_info): + return { + 'title': str_or_none(share_info.get('title')), + 'description': str_or_none(share_info.get('desc')), + 'image': try_get(share_info, lambda x: x['image'], dict), + 'width': try_get(share_info, lambda x: x['image']['width'], int), + 'height': try_get(share_info, lambda x: x['image']['height'], int), + } + + def _extract_aweme(self, video_data, webpage): + video_info_data = try_get( + video_data, lambda x: x['videoData']['itemInfos'], dict) + author_info_data = try_get( + video_data, lambda x: x['videoData']['authorInfos'], dict) + track_info_data = try_get( + video_data, lambda x: x['videoData']['musicInfos'], dict) + share_info_data = try_get(video_data, lambda x: x['shareMeta'], dict) + + video_info = self._video_info(video_info_data) + author_info = self._author_info(author_info_data) + track_info = 
self._track_info(track_info_data) + share_info = self._share_info(share_info_data) + + timestamp = int(video_info.get('timestamp')) or 0 + date = str_or_none(datetime.fromtimestamp( + timestamp).strftime('%Y%m%d')) + + thumbnails = [] + thumbnails.append({ + 'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'width': video_info.get('width'), + 'height': video_info.get('height') + }) + + description = video_info.get( + 'description') or share_info.get('description') + if description is None: + tags = [] + else: + tags = re.findall(r"#(\w+)", description) - format_urls = set() formats = [] - for format_id in ( - 'play_addr_lowbr', 'play_addr', 'play_addr_h264', - 'download_addr'): - for format in try_get( - video, lambda x: x[format_id]['url_list'], list) or []: - format_url = url_or_none(format) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - }) - self._sort_formats(formats) - - thumbnail = url_or_none(try_get( - video, lambda x: x['cover']['url_list'][0], compat_str)) - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - timestamp = int_or_none(data.get('create_time')) - comment_count = int_or_none(data.get('comment_count')) or int_or_none( - try_get(data, lambda x: x['statistics']['comment_count'])) - repost_count = int_or_none(try_get( - data, lambda x: x['statistics']['share_count'])) - - aweme_id = data['aweme_id'] + formats.append({ + 'url': video_info.get('video_url') or self._og_search_video_url(webpage), + 'ext': 'mp4', + 'height': video_info.get('height'), + 'width': video_info.get('width'), + }) return { - 'id': aweme_id, - 'title': uploader or aweme_id, + 'artist': track_info.get('artist'), + 'channel_id': author_info.get('channel_id'), + 'channel_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')), + 'comment_count': 
video_info.get('comment_count'), + 'creator': author_info.get('creator'), 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'repost_count': repost_count, + 'duration': video_info.get('duration'), 'formats': formats, + 'height': video_info.get('height'), + 'id': video_info.get('id'), + 'like_count': video_info.get('like_count'), + 'playlist_title': share_info.get('title'), + 'playlist_uploader': author_info.get('uploader'), + 'playlist_uploader_id': author_info.get('uploader_id'), + 'repost_count': video_info.get('repost_count'), + 'release_date': date, + 'tags': tags, + 'thumbnail': video_info.get('thumbnail'), + 'thumbnails': thumbnails, + 'timestamp': int(video_info.get('timestamp')), + 'title': share_info.get('title') or self._og_search_title(webpage), + 'track': track_info.get('track'), + 'track_id': track_info.get('track_id'), + 'upload_date': date, + 'uploader': author_info.get('uploader'), + 'uploader_id': author_info.get('uploader_id'), + 'uploader_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')), + 'webpage_url': self._og_search_url(webpage), + 'width': video_info.get('width') } class TikTokIE(TikTokBaseIE): _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/v| - (?:www\.)?tiktok\.com/share/video - ) - /(?P\d+) - ''' + https?:// + (?: + (?:www|m)\. + (?:tiktok.com)\/ + (@(?P[\w\.]+))? + (?:v|video|embed|trending)?(?:\/)? + (?:video)?(?:\/)? + (?:\?shareId=)? + ) + (?P[\d]{6,}) + (?:\.html)? + (?:\?.*)? 
+ $ + ''' + _TESTS = [{ - 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': 'd584b572e92fcd48888051f238022420', + 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', + 'md5': '34a7543afd5a151b0840ba6736fb633b', 'info_dict': { - 'id': '6606727368545406213', + 'id': '6748451240264420610', 'ext': 'mp4', - 'title': 'Zureeal', - 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', - 'thumbnail': r're:^https?://.*~noop.image', - 'uploader': 'Zureeal', - 'timestamp': 1538248586, - 'upload_date': '20180929', + 'title': 'facestoriesbyleenabh on TikTok', + 'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95', + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'uploader': 'leenabhushan', + 'timestamp': 1571246252, + 'upload_date': '20191016', 'comment_count': int, 'repost_count': int, + 'like_count': int, + 'playlist_title': 'facestoriesbyleenabh on TikTok', + 'playlist_uploader': 'leenabhushan', + 'playlist_uploader_id': '6691488002098119685', + 'artist': 'Jass Manak', + 'channel_id': '6691488002098119685', + 'channel_url': 'https://www.tiktok.com/@leenabhushan', + 'creator': 'facestoriesbyleenabh', + 'duration': 13, + 'formats': list, + 'height': 1280, + 'release_date': '20191016', + 'tags': list, + 'thumbnails': list, + 'track': 'Lehanga', + 'track_id': '6716465478027447045', + 'uploader_id': '6691488002098119685', + 'uploader_url': r're:https://www.tiktok.com/@leenabhushan', + 'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610', + 'width': 720, } }, { - 'url': 'https://www.tiktok.com/share/video/6606727368545406213', - 'only_matching': True, + 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', + 'md5': '06b9800d47d5fe51a19e322dd86e61c9', + 'info_dict': { + 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', + 'channel_id': '18702747', + 'channel_url': 
'https://www.tiktok.com/@patroxofficial', + 'comment_count': int, + 'creator': 'patroX', + 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', + 'duration': 27, + 'ext': 'mp4', + 'formats': list, + 'height': 960, + 'id': '6742501081818877190', + 'like_count': int, + 'playlist_title': 'patroX on TikTok', + 'playlist_uploader_id': '18702747', + 'playlist_uploader': 'patroxofficial', + 'release_date': '20190930', + 'repost_count': int, + 'tags': list, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'thumbnails': list, + 'timestamp': 1569860870, + 'title': 'patroX on TikTok', + 'track_id': '209649576000286720', + 'track': 'Big Fun', + 'upload_date': '20190930', + 'uploader_id': '18702747', + 'uploader_url': r're:https://www.tiktok.com/@patroxofficial', + 'uploader': 'patroxofficial', + 'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190', + 'width': 540, + } + }, { + 'url': 'https://m.tiktok.com/v/6749869095467945218.html', + 'only_matching': True + }, { + 'url': 'https://www.tiktok.com/@cchelseameow/video/6751181801206729990', + 'only_matching': True + }, { + 'url': 'https://www.tiktok.com/embed/6567659045795758085', + 'only_matching': True + }, { + 'url': 'https://www.tiktok.com/trending?shareId=6744531482393545985', + 'only_matching': True + }, { + 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610?enter_from=h5_m', + 'only_matching': True }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://m.tiktok.com/v/%s.html' % video_id, video_id) - data = self._parse_json(self._search_regex( - r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) - return self._extract_aweme(data) + headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0', + 'Referer': url + } + webpage = self._download_webpage(url, video_id, headers=headers, note='Downloading video webpage') + json_string = self._search_regex( + 
r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\">\s*(?P[^<]+)', + webpage, 'json_string', group='json_string') + json_data = self._parse_json(json_string, video_id) + video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict) -class TikTokUserIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/h5/share/usr| - (?:www\.)?tiktok\.com/share/user - ) - /(?P\d+) - ''' - _TESTS = [{ - 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', - 'info_dict': { - 'id': '188294915489964032', - }, - 'playlist_mincount': 24, - }, { - 'url': 'https://www.tiktok.com/share/user/188294915489964032', - 'only_matching': True, - }] + # Chech statusCode for success + if video_data.get('statusCode') == 0: + return self._extract_aweme(video_data, webpage) + + raise ExtractorError("Video not available", video_id=video_id) - def _real_extract(self, url): - user_id = self._match_id(url) - data = self._download_json( - 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, - query={'_signature': '_'}) - entries = [] - for aweme in data['aweme_list']: - try: - entry = self._extract_aweme(aweme) - except ExtractorError: - continue - entry['extractor_key'] = TikTokIE.ie_key() - entries.append(entry) - return self.playlist_result(entries, user_id) From b901b6a08bcba71ce1efe7652def534f5c51879b Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Tue, 24 Dec 2019 11:39:43 +0530 Subject: [PATCH 02/10] [tiktok] fix json_string extraction --- youtube_dl/extractor/tiktok.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 539fc9ecd..185b66eb6 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -239,7 +239,7 @@ def _real_extract(self, url): } webpage = self._download_webpage(url, video_id, headers=headers, note='Downloading video webpage') json_string = self._search_regex( - 
r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\">\s*(?P[^<]+)', + r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P[^<]+)', webpage, 'json_string', group='json_string') json_data = self._parse_json(json_string, video_id) video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict) From 6fb11ca8515eb8f9dcdc1f982b3f2993932540dc Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Sat, 15 Feb 2020 12:09:53 +0530 Subject: [PATCH 03/10] [fix] refactor code --- youtube_dl/extractor/tiktok.py | 199 ++++++++------------------------- 1 file changed, 46 insertions(+), 153 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 185b66eb6..d4f6d9055 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals from datetime import datetime -import re from .common import InfoExtractor from ..utils import ( @@ -13,116 +12,55 @@ class TikTokBaseIE(InfoExtractor): - def _video_info(self, video_info): - return { - 'id': str_or_none(video_info.get('id')), - 'thumbnail': try_get(video_info, lambda x: x['covers'][0], str) or try_get(video_info, lambda x: x['video']['videoMeta']['cover'][0], str), - 'video_url': try_get(video_info, lambda x: x['video']['urls'][0], str) or video_info.get('video', {}).get('urls', [None])[0], - 'width': try_get(video_info, lambda x: x['video']['videoMeta']['width'], int) or try_get(video_info, lambda x: x['width'], int), - 'height': try_get(video_info, lambda x: x['video']['videoMeta']['height'], int) or try_get(video_info, lambda x: x['height'], int), - 'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int), - 'description': str_or_none(video_info.get('text')), - 'comment_count': int_or_none(video_info.get('commentCount')), - 'like_count': int_or_none(video_info.get('diggCount')), - 'repost_count': int_or_none(video_info.get('shareCount')), - 
'timestamp': str_or_none(video_info.get('createTime')), - 'track_id': str_or_none(video_info.get('musicId')) - } - - def _author_info(self, author_info): - return { - 'uploader': str_or_none(author_info.get('uniqueId')), - 'creator': str_or_none(author_info.get('nickName')), - 'uploader_id': str_or_none(author_info.get('userId')), - 'channel_id': str_or_none(author_info.get('userId')) - } - - def _track_info(self, track_info): - return { - 'track': str_or_none(track_info.get('musicName')), - 'track_id': str_or_none(track_info.get('musicId')), - 'artist': str_or_none(track_info.get('authorName')) - } - - def _share_info(self, share_info): - return { - 'title': str_or_none(share_info.get('title')), - 'description': str_or_none(share_info.get('desc')), - 'image': try_get(share_info, lambda x: x['image'], dict), - 'width': try_get(share_info, lambda x: x['image']['width'], int), - 'height': try_get(share_info, lambda x: x['image']['height'], int), - } - def _extract_aweme(self, video_data, webpage): - video_info_data = try_get( + video_info = try_get( video_data, lambda x: x['videoData']['itemInfos'], dict) - author_info_data = try_get( + author_info = try_get( video_data, lambda x: x['videoData']['authorInfos'], dict) - track_info_data = try_get( - video_data, lambda x: x['videoData']['musicInfos'], dict) - share_info_data = try_get(video_data, lambda x: x['shareMeta'], dict) + share_info = try_get(video_data, lambda x: x['shareMeta'], dict) - video_info = self._video_info(video_info_data) - author_info = self._author_info(author_info_data) - track_info = self._track_info(track_info_data) - share_info = self._share_info(share_info_data) - - timestamp = int(video_info.get('timestamp')) or 0 - date = str_or_none(datetime.fromtimestamp( - timestamp).strftime('%Y%m%d')) + unique_id = str_or_none(author_info.get('uniqueId')) + timestamp = try_get(video_info, lambda x: int(x['createTime']), int) + date = datetime.fromtimestamp(timestamp).strftime('%Y%m%d') + height = 
try_get(video_info, lambda x: x['video']['videoMeta']['height'], int) + width = try_get(video_info, lambda x: x['video']['videoMeta']['width'], int) thumbnails = [] thumbnails.append({ 'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage), - 'width': video_info.get('width'), - 'height': video_info.get('height') + 'width': width, + 'height': height }) - description = video_info.get( - 'description') or share_info.get('description') - if description is None: - tags = [] - else: - tags = re.findall(r"#(\w+)", description) - formats = [] formats.append({ - 'url': video_info.get('video_url') or self._og_search_video_url(webpage), + 'url': try_get(video_info, lambda x: x['video']['urls'][0], str), 'ext': 'mp4', - 'height': video_info.get('height'), - 'width': video_info.get('width'), + 'height': height, + 'width': width }) return { - 'artist': track_info.get('artist'), - 'channel_id': author_info.get('channel_id'), - 'channel_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')), - 'comment_count': video_info.get('comment_count'), - 'creator': author_info.get('creator'), - 'description': description, - 'duration': video_info.get('duration'), - 'formats': formats, - 'height': video_info.get('height'), - 'id': video_info.get('id'), - 'like_count': video_info.get('like_count'), - 'playlist_title': share_info.get('title'), - 'playlist_uploader': author_info.get('uploader'), - 'playlist_uploader_id': author_info.get('uploader_id'), - 'repost_count': video_info.get('repost_count'), - 'release_date': date, - 'tags': tags, - 'thumbnail': video_info.get('thumbnail'), + 'comment_count': int_or_none(video_info.get('commentCount')), + 'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int), + 'height': height, + 'id': str_or_none(video_info.get('id')), + 'like_count': int_or_none(video_info.get('diggCount')), + 'repost_count': int_or_none(video_info.get('shareCount')), + 'thumbnail': try_get(video_info, lambda x: 
x['covers'][0], str), + 'timestamp': timestamp, + 'width': width, + 'title': self._og_search_title(webpage), + 'creator': str_or_none(author_info.get('nickName')), + 'uploader': unique_id, + 'uploader_id': str_or_none(author_info.get('userId')), + 'uploader_url': 'https://www.tiktok.com/@' + unique_id, 'thumbnails': thumbnails, - 'timestamp': int(video_info.get('timestamp')), - 'title': share_info.get('title') or self._og_search_title(webpage), - 'track': track_info.get('track'), - 'track_id': track_info.get('track_id'), 'upload_date': date, - 'uploader': author_info.get('uploader'), - 'uploader_id': author_info.get('uploader_id'), - 'uploader_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')), 'webpage_url': self._og_search_url(webpage), - 'width': video_info.get('width') + 'description': str_or_none(video_info.get('text')) or str_or_none(share_info.get('desc')), + 'ext': 'mp4', + 'formats': formats } @@ -130,49 +68,34 @@ class TikTokIE(TikTokBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www|m)\. + (?:www|vm)\. (?:tiktok.com)\/ - (@(?P[\w\.]+))? - (?:v|video|embed|trending)?(?:\/)? - (?:video)?(?:\/)? - (?:\?shareId=)? + (@(?P[\w\.]+))\/ + (?:video)\/ ) (?P[\d]{6,}) - (?:\.html)? - (?:\?.*)? 
- $ ''' _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', 'md5': '34a7543afd5a151b0840ba6736fb633b', 'info_dict': { - 'id': '6748451240264420610', - 'ext': 'mp4', - 'title': 'facestoriesbyleenabh on TikTok', - 'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95', - 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', - 'uploader': 'leenabhushan', - 'timestamp': 1571246252, - 'upload_date': '20191016', 'comment_count': int, - 'repost_count': int, - 'like_count': int, - 'playlist_title': 'facestoriesbyleenabh on TikTok', - 'playlist_uploader': 'leenabhushan', - 'playlist_uploader_id': '6691488002098119685', - 'artist': 'Jass Manak', - 'channel_id': '6691488002098119685', - 'channel_url': 'https://www.tiktok.com/@leenabhushan', 'creator': 'facestoriesbyleenabh', + 'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95', 'duration': 13, + 'ext': 'mp4', 'formats': list, 'height': 1280, - 'release_date': '20191016', - 'tags': list, + 'id': '6748451240264420610', + 'like_count': int, + 'repost_count': int, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'thumbnails': list, - 'track': 'Lehanga', - 'track_id': '6716465478027447045', + 'timestamp': 1571246252, + 'title': 'facestoriesbyleenabh on TikTok', + 'upload_date': '20191016', + 'uploader': 'leenabhushan', 'uploader_id': '6691488002098119685', 'uploader_url': r're:https://www.tiktok.com/@leenabhushan', 'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610', @@ -182,9 +105,6 @@ class TikTokIE(TikTokBaseIE): 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', 'md5': '06b9800d47d5fe51a19e322dd86e61c9', 'info_dict': { - 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', - 'channel_id': '18702747', - 'channel_url': 'https://www.tiktok.com/@patroxofficial', 'comment_count': int, 'creator': 'patroX', 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', @@ -194,50 
+114,24 @@ class TikTokIE(TikTokBaseIE): 'height': 960, 'id': '6742501081818877190', 'like_count': int, - 'playlist_title': 'patroX on TikTok', - 'playlist_uploader_id': '18702747', - 'playlist_uploader': 'patroxofficial', - 'release_date': '20190930', 'repost_count': int, - 'tags': list, 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'thumbnails': list, 'timestamp': 1569860870, 'title': 'patroX on TikTok', - 'track_id': '209649576000286720', - 'track': 'Big Fun', 'upload_date': '20190930', + 'uploader': 'patroxofficial', 'uploader_id': '18702747', 'uploader_url': r're:https://www.tiktok.com/@patroxofficial', - 'uploader': 'patroxofficial', 'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190', 'width': 540, } - }, { - 'url': 'https://m.tiktok.com/v/6749869095467945218.html', - 'only_matching': True - }, { - 'url': 'https://www.tiktok.com/@cchelseameow/video/6751181801206729990', - 'only_matching': True - }, { - 'url': 'https://www.tiktok.com/embed/6567659045795758085', - 'only_matching': True - }, { - 'url': 'https://www.tiktok.com/trending?shareId=6744531482393545985', - 'only_matching': True - }, { - 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610?enter_from=h5_m', - 'only_matching': True }] def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0', - 'Referer': url - } - webpage = self._download_webpage(url, video_id, headers=headers, note='Downloading video webpage') + webpage = self._download_webpage(url, video_id, note='Downloading video webpage') json_string = self._search_regex( r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P[^<]+)', webpage, 'json_string', group='json_string') @@ -248,5 +142,4 @@ def _real_extract(self, url): if video_data.get('statusCode') == 0: return self._extract_aweme(video_data, webpage) - raise ExtractorError("Video not available", 
video_id=video_id) - + raise ExtractorError('Video not available', video_id=video_id) From 6255e567d92e7d8fe7920c34db050a95b8d3ff7f Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Sat, 15 Feb 2020 12:30:12 +0530 Subject: [PATCH 04/10] [tiktok] fix regex --- youtube_dl/extractor/tiktok.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index d4f6d9055..613ac9cc8 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -65,16 +65,7 @@ def _extract_aweme(self, video_data, webpage): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www|vm)\. - (?:tiktok.com)\/ - (@(?P[\w\.]+))\/ - (?:video)\/ - ) - (?P[\d]{6,}) - ''' + _VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', @@ -133,8 +124,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id, note='Downloading video webpage') json_string = self._search_regex( - r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P[^<]+)', - webpage, 'json_string', group='json_string') + r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P[^<]+)', + webpage, 'json_string', group='json_string_ld') json_data = self._parse_json(json_string, video_id) video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict) From b19eec0d33e40ee5466669ac810f160648787dba Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Sat, 1 Aug 2020 11:57:52 +0530 Subject: [PATCH 05/10] [fix] python 2 "url" field is missing or empty `try_get` fails for `expected_type=str`, because in python 2 string has `unicode` type. This commit removes the `expected_type` to disable the comparison. 
--- youtube_dl/extractor/tiktok.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 613ac9cc8..c80001fe3 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -34,7 +34,7 @@ def _extract_aweme(self, video_data, webpage): formats = [] formats.append({ - 'url': try_get(video_info, lambda x: x['video']['urls'][0], str), + 'url': try_get(video_info, lambda x: x['video']['urls'][0]), 'ext': 'mp4', 'height': height, 'width': width @@ -47,7 +47,7 @@ def _extract_aweme(self, video_data, webpage): 'id': str_or_none(video_info.get('id')), 'like_count': int_or_none(video_info.get('diggCount')), 'repost_count': int_or_none(video_info.get('shareCount')), - 'thumbnail': try_get(video_info, lambda x: x['covers'][0], str), + 'thumbnail': try_get(video_info, lambda x: x['covers'][0]), 'timestamp': timestamp, 'width': width, 'title': self._og_search_title(webpage), From 89cee32ce9b504bd2b892f6fd2bcda46ae33a10c Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Tue, 11 Aug 2020 12:02:23 +0530 Subject: [PATCH 06/10] fix(tiktok): update title getter Signed-off-by: Aakash Gajjar --- youtube_dl/extractor/tiktok.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index c80001fe3..075a2cdf9 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -50,7 +50,7 @@ def _extract_aweme(self, video_data, webpage): 'thumbnail': try_get(video_info, lambda x: x['covers'][0]), 'timestamp': timestamp, 'width': width, - 'title': self._og_search_title(webpage), + 'title': str_or_none(share_info.get('title')) or self._og_search_title(webpage), 'creator': str_or_none(author_info.get('nickName')), 'uploader': unique_id, 'uploader_id': str_or_none(author_info.get('userId')), From b827ee921fe510a8730a9fab070148ed2b8279b5 Mon Sep 17 00:00:00 2001 From: Aakash Gajjar Date: Tue, 25 Aug 
2020 20:23:34 +0530 Subject: [PATCH 07/10] pull changes from remote master (#190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [scrippsnetworks] Add new extractor(closes #19857)(closes #22981) * [teachable] Improve locked lessons detection (#23528) * [teachable] Fail with error message if no video URL found * [extractors] add missing import for ScrippsNetworksIE * [brightcove] cache brightcove player policy keys * [prosiebensat1] improve geo restriction handling(closes #23571) * [soundcloud] automatically update client id on failing requests * [spankbang] Fix extraction (closes #23307, closes #23423, closes #23444) * [spankbang] Improve removed video detection (#23423) * [brightcove] update policy key on failing requests * [pornhub] Fix extraction and add support for m3u8 formats (closes #22749, closes #23082) * [pornhub] Improve locked videos detection (closes #22449, closes #22780) * [brightcove] invalidate policy key cache on failing requests * [soundcloud] fix client id extraction for non fatal requests * [ChangeLog] Actualize [ci skip] * [devscripts/create-github-release] Switch to using PAT for authentication Basic authentication will be deprecated soon * release 2020.01.01 * [redtube] Detect private videos (#23518) * [vice] improve extraction(closes #23631) * [devscripts/create-github-release] Remove unused import * [wistia] improve format extraction and extract subtitles(closes #22590) * [nrktv:seriebase] Fix extraction (closes #23625) (#23537) * [discovery] fix anonymous token extraction(closes #23650) * [scrippsnetworks] add support for www.discovery.com videos * [scrippsnetworks] correct test case URL * [dctp] fix format extraction(closes #23656) * [pandatv] Remove extractor (#23630) * [naver] improve extraction - improve geo-restriction handling - extract automatic captions - extract uploader metadata - extract VLive HLS formats * [naver] improve metadata extraction * [cloudflarestream] improve extraction - 
add support for bytehighway.net domain - add support for signed URLs - extract thumbnail * [cloudflarestream] import embed URL extraction * [lego] fix extraction and extract subtitle(closes #23687) * [safari] Fix kaltura session extraction (closes #23679) (#23670) * [orf:fm4] Fix extraction (#23599) * [orf:radio] Clean description and improve extraction * [twitter] add support for promo_video_website cards(closes #23711) * [vodplatform] add support for embed.kwikmotion.com domain * [ndr:base:embed] Improve thumbnails extraction (closes #23731) * [canvas] Add support for new API endpoint and update tests (closes #17680, closes #18629) * [travis] Add flake8 job (#23720) * [yourporn] Fix extraction (closes #21645, closes #22255, closes #23459) * [ChangeLog] Actualize [ci skip] * release 2020.01.15 * [soundcloud] Restore previews extraction (closes #23739) * [orf:tvthek] Improve geo restricted videos detection (closes #23741) * [zype] improve extraction - extract subtitles(closes #21258) - support URLs with alternative keys/tokens(#21258) - extract more metadata * [americastestkitchen] fix extraction * [nbc] add support for nbc multi network URLs(closes #23049) * [ard] improve extraction(closes #23761) - simplify extraction - extract age limit and series - bypass geo-restriction * [ivi:compilation] Fix entries extraction (closes #23770) * [24video] Add support for 24video.vip (closes #23753) * [businessinsider] Fix jwplatform id extraction (closes #22929) (#22954) * [ard] add a missing condition * [azmedien] fix extraction(closes #23783) * [voicerepublic] fix extraction * [stretchinternet] fix extraction(closes #4319) * [youtube] Fix sigfunc name extraction (closes #23819) * [ChangeLog] Actualize [ci skip] * release 2020.01.24 * [soundcloud] imporve private playlist/set tracks extraction https://github.com/ytdl-org/youtube-dl/issues/3707#issuecomment-577873539 * [svt] fix article extraction(closes #22897)(closes #22919) * [svt] fix series extraction(closes #22297) * 
[viewlift] improve extraction - fix extraction(closes #23851) - add add support for authentication - add support for more domains * [vimeo] fix album extraction(closes #23864) * [tva] Relax _VALID_URL (closes #23903) * [tv5mondeplus] Fix extraction (closes #23907, closes #23911) * [twitch:stream] Lowercase channel id for stream request (closes #23917) * [sportdeutschland] Update to new sportdeutschland API They switched to SSL, but under a different host AND path... Remove the old test cases because these videos have become unavailable. * [popcorntimes] Add extractor (closes #23949) * [thisoldhouse] fix extraction(closes #23951) * [toggle] Add support for mewatch.sg (closes #23895) (#23930) * [compat] Introduce compat_realpath (refs #23991) * [update] Fix updating via symlinks (closes #23991) * [nytimes] improve format sorting(closes #24010) * [abc:iview] Support 720p (#22907) (#22921) * [nova:embed] Fix extraction (closes #23672) * [nova:embed] Improve (closes #23690) * [nova] Improve extraction (refs #23690) * [jpopsuki] Remove extractor (closes #23858) * [YoutubeDL] Fix playlist entry indexing with --playlist-items (closes #10591, closes #10622) * [test_YoutubeDL] Fix get_ids * [test_YoutubeDL] Add tests for #10591 (closes #23873) * [24video] Add support for porn.24video.net (closes #23779, closes #23784) * [npr] Add support for streams (closes #24042) * [ChangeLog] Actualize [ci skip] * release 2020.02.16 * [tv2dk:bornholm:play] Fix extraction (#24076) * [imdb] Fix extraction (closes #23443) * [wistia] Add support for multiple generic embeds (closes #8347, closes #11385) * [teachable] Add support for multiple videos per lecture (closes #24101) * [pornhd] Fix extraction (closes #24128) * [options] Remove duplicate short option -v for --version (#24162) * [extractor/common] Convert ISM manifest to unicode before processing on python 2 (#24152) * [YoutubeDL] Force redirect URL to unicode on python 2 * Remove no longer needed compat_str around geturl * [youjizz] 
Fix extraction (closes #24181) * [test_subtitles] Remove obsolete test * [zdf:channel] Fix tests * [zapiks] Fix test * [xtube] Fix metadata extraction (closes #21073, closes #22455) * [xtube:user] Fix test * [telecinco] Fix extraction (refs #24195) * [telecinco] Add support for article opening videos * [franceculture] Fix extraction (closes #24204) * [xhamster] Fix extraction (closes #24205) * [ChangeLog] Actualize [ci skip] * release 2020.03.01 * [vimeo] Fix subtitles URLs (#24209) * [servus] Add support for new URL schema (closes #23475, closes #23583, closes #24142) * [youtube:playlist] Fix tests (closes #23872) (#23885) * [peertube] Improve extraction * [peertube] Fix issues and improve extraction (closes #23657) * [pornhub] Improve title extraction (closes #24184) * [vimeo] fix showcase password protected video extraction(closes #24224) * [youtube] Fix age-gated videos support without login (closes #24248) * [youtube] Fix tests * [ChangeLog] Actualize [ci skip] * release 2020.03.06 * [nhk] update API version(closes #24270) * [youtube] Improve extraction in 429 error conditions (closes #24283) * [youtube] Improve age-gated videos extraction in 429 error conditions (refs #24283) * [youtube] Remove outdated code Additional get_video_info requests don't seem to provide any extra itags any longer * [README.md] Clarify 429 error * [pornhub] Add support for pornhubpremium.com (#24288) * [utils] Add support for cookies with spaces used instead of tabs * [ChangeLog] Actualize [ci skip] * release 2020.03.08 * Revert "[utils] Add support for cookies with spaces used instead of tabs" According to [1] TABs must be used as separators between fields. Files produces by some tools with spaces as separators are considered malformed. 1. https://curl.haxx.se/docs/http-cookies.html This reverts commit cff99c91d150df2a4e21962a3ca8d4ae94533b8c. 
* [utils] Add reference to cookie file format * Revert "[vimeo] fix showcase password protected video extraction(closes #24224)" This reverts commit 12ee431676bb655f04c7dd416a73c1f142ed368d. * [nhk] Relax _VALID_URL (#24329) * [nhk] Remove obsolete rtmp formats (closes #24329) * [nhk] Update m3u8 URL and use native hls (#24329) * [ndr] Fix extraction (closes #24326) * [xtube] Fix formats extraction (closes #24348) * [xtube] Fix typo * [hellporno] Fix extraction (closes #24399) * [cbc:watch] Add support for authentication * [cbc:watch] Fix authenticated device token caching (closes #19160) * [soundcloud] fix download url extraction(closes #24394) * [limelight] remove disabled API requests(closes #24255) * [bilibili] Add support for new URL schema with BV ids (closes #24439, closes #24442) * [bilibili] Add support for player.bilibili.com (closes #24402) * [teachable] Extract chapter metadata (closes #24421) * [generic] Look for teachable embeds before wistia * [teachable] Update upskillcourses domain New version does not use teachable platform any longer * [teachable] Update gns3 domain * [teachable] Update test * [ChangeLog] Actualize [ci skip] * [ChangeLog] Actualize [ci skip] * release 2020.03.24 * [spankwire] Fix extraction (closes #18924, closes #20648) * [spankwire] Add support for generic embeds (refs #24633) * [youporn] Add support form generic embeds * [mofosex] Add support for generic embeds (closes #24633) * [tele5] Fix extraction (closes #24553) * [extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667) * [tv4] Fix ISM formats extraction (closes #24667) * [twitch:clips] Extend _VALID_URL (closes #24290) (#24642) * [motherless] Fix extraction (closes #24699) * [nova:embed] Fix extraction (closes #24700) * [youtube] Skip broken multifeed videos (closes #24711) * [soundcloud] Extract AAC format * [soundcloud] Improve AAC format extraction (closes #19173, closes #24708) * [thisoldhouse] Fix video id extraction (closes #24548) 
Added support for: with or without "www." and either ".chorus.build" or ".com". It now validates correctly on older URLs ```