From b801cd7179c9546f4054dc534ec4b713e09976a7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 20 May 2022 06:01:08 +0530 Subject: [PATCH] [tiktok] Detect embeds Closes #3799 --- yt_dlp/extractor/generic.py | 6 ++++++ yt_dlp/extractor/tiktok.py | 28 ++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index b0fc176ef..c7e9ea059 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -74,6 +74,7 @@ from .ted import TedEmbedIE from .theplatform import ThePlatformIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tnaflix import TNAFlixNetworkEmbedIE from .tube8 import Tube8IE from .tunein import TuneInBaseIE @@ -3756,6 +3757,11 @@ def _real_extract(self, url): if ruutu_urls: return self.playlist_from_matches(ruutu_urls, video_id, video_title) + # Look for Tiktok embeds + tiktok_urls = TikTokIE._extract_urls(webpage) + if tiktok_urls: + return self.playlist_from_matches(tiktok_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 4ba993582..4926096c0 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1,28 +1,26 @@ import itertools +import json import random +import re import string import time -import json from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse -) +from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse from ..utils import ( ExtractorError, HEADRequest, + LazyList, UnsupportedError, get_first, int_or_none, join_nonempty, - LazyList, + qualities, srt_subtitles_timecode, str_or_none, traverse_obj, try_get, url_or_none, - qualities, ) @@ -36,6 +34,10 @@ class TikTokBaseIE(InfoExtractor): _WEBPAGE_HOST = 'https://www.tiktok.com/' 
QUALITIES = ('360p', '540p', '720p', '1080p') + @staticmethod + def _create_url(user_id, video_id): + return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' + def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) @@ -361,7 +363,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)' + _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', @@ -466,7 +468,7 @@ class TikTokIE(TikTokBaseIE): 'info_dict': { 'id': '7059698374567611694', 'ext': 'mp4', - 'title': 'tiktok video #7059698374567611694', + 'title': 'TikTok video #7059698374567611694', 'description': '', 'uploader': 'pokemonlife22', 'creator': 'Pokemon', @@ -490,6 +492,11 @@ class TikTokIE(TikTokBaseIE): 'only_matching': True }] + @classmethod + def _extract_urls(cls, webpage): + return [mobj.group('url') for mobj in re.finditer( + rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{cls._VALID_URL})', webpage)] + def _extract_aweme_app(self, aweme_id): try: aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, @@ -506,7 +513,8 @@ def _extract_aweme_app(self, aweme_id): return self._parse_aweme_video_app(aweme_detail) def _real_extract(self, url): - video_id = self._match_id(url) + video_id, user_id = self._match_valid_url(url).group('id', 'user_id') + url = self._create_url(user_id, video_id) try: return self._extract_aweme_app(video_id)