From e40c758c2a8f049cd254ad73f0e6ade00bd004d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 May 2020 07:18:08 +0700 Subject: [PATCH] [youtube] Improve player id extraction and add tests --- test/test_youtube_signature.py | 22 +++++++++++++++++++ youtube_dl/extractor/youtube.py | 38 +++++++++++++++------------------ 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index f0c370eeed..69df30edaa 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -74,6 +74,28 @@ ] +class TestPlayerInfo(unittest.TestCase): + def test_youtube_extract_player_info(self): + PLAYER_URLS = ( + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + # obsolete + ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), + ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), + ('https://www.youtube.com/yts/jsbin/player_ias-vflCPQUIL/en_US/base.js', 'vflCPQUIL'), + ('https://www.youtube.com/yts/jsbin/player-vflzQZbt7/en_US/base.js', 'vflzQZbt7'), + ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), + ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), + ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), + ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), + ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), + ) + for player_url, expected_player_id in PLAYER_URLS: + expected_player_type = player_url.split('.')[-1] + player_type, player_id = YoutubeIE._extract_player_info(player_url) + self.assertEqual(player_type, expected_player_type) + self.assertEqual(player_id, expected_player_id) + + class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28886cff2b..5ea66c9620 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -426,6 +426,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' + _PLAYER_INFO_RE = ( + r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + ) _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1273,14 +1277,18 @@ def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?[-.](?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', - player_url) - if not id_m: + @classmethod + def _extract_player_info(cls, player_url): + for player_re in cls._PLAYER_INFO_RE: + id_m = re.search(player_re, player_url) + if id_m: + break + else: raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') + return id_m.group('ext'), id_m.group('id') + + def _extract_signature_function(self, video_id, player_url, example_sig): + player_type, player_id = self._extract_player_info(player_url) # Read from filesystem cache func_id = '%s_%s_%s' % ( @@ -2009,22 +2017,10 @@ def _extract_filesize(media_url): if self._downloader.params.get('verbose'): if player_url is None: - player_version = 'unknown' player_desc = 'unknown' else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - + player_type, player_version = self._extract_player_info(player_url) + player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc))