From 48be862b32648bff5b3e553e40fca4dcc6e88b28 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Thu, 27 Mar 2025 17:31:01 -0500
Subject: [PATCH] [ie/youtube] Make signature and nsig extraction more robust
 (#12761)

Authored by: bashonly, seproDev

Co-authored-by: sepro <sepro@sepr0.com>
---
 test/test_youtube_signature.py     | 78 ++++++++++++++++++++++++---
 yt_dlp/extractor/youtube/_video.py | 87 ++++++++++++++++++++++--------
 2 files changed, 136 insertions(+), 29 deletions(-)

diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index c79fdc9df..0f0885366 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -88,11 +88,51 @@ _SIG_TESTS = [
         '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
         '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
     ),
+    (
+        'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+    ),
     (
         'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
         '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
         'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
     ),
+    (
+        'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0',
+    ),
 ]
 
 _NSIG_TESTS = [
@@ -252,6 +292,30 @@ _NSIG_TESTS = [
         'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
         'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
     ),
+    (
+        'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js',
+        'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js',
+        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js',
+        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE',
+    ),
 ]
 
 
@@ -302,33 +366,33 @@ def t_factory(name, sig_func, url_pattern):
         test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))
 
         def test_func(self):
-            basename = f'player-{name}-{test_id}.js'
+            basename = f'player-{test_id}.js'
             fn = os.path.join(self.TESTDATA_DIR, basename)
 
             if not os.path.exists(fn):
                 urllib.request.urlretrieve(url, fn)
             with open(fn, encoding='utf-8') as testf:
                 jscode = testf.read()
-            self.assertEqual(sig_func(jscode, sig_input), expected_sig)
+            self.assertEqual(sig_func(jscode, sig_input, url), expected_sig)
 
         test_func.__name__ = f'test_{name}_js_{test_id}'
         setattr(TestSignature, test_func.__name__, test_func)
     return make_tfunc
 
 
-def signature(jscode, sig_input):
-    func = YoutubeIE(FakeYDL())._parse_sig_js(jscode)
+def signature(jscode, sig_input, player_url):
+    func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url)
     src_sig = (
         str(string.printable[:sig_input])
         if isinstance(sig_input, int) else sig_input)
     return func(src_sig)
 
 
-def n_sig(jscode, sig_input):
+def n_sig(jscode, sig_input, player_url):
     ie = YoutubeIE(FakeYDL())
-    funcname = ie._extract_n_function_name(jscode)
+    funcname = ie._extract_n_function_name(jscode, player_url=player_url)
     jsi = JSInterpreter(jscode)
-    func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode))
+    func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url))
     return func([sig_input])
 
 
diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py
index e349b3651..d86bbaff8 100644
--- a/yt_dlp/extractor/youtube/_video.py
+++ b/yt_dlp/extractor/youtube/_video.py
@@ -34,6 +34,7 @@ from ...utils import (
     clean_html,
     datetime_from_str,
     filesize_from_tbr,
+    filter_dict,
     float_or_none,
     format_field,
     get_first,
@@ -1986,12 +1987,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         assert os.path.basename(func_id) == func_id
 
         self.write_debug(f'Extracting signature function {func_id}')
-        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
+        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.27'), None
 
         if not cache_spec:
             code = self._load_player(video_id, player_url)
         if code:
-            res = self._parse_sig_js(code)
+            res = self._parse_sig_js(code, player_url)
             test_string = ''.join(map(chr, range(len(example_sig))))
             cache_spec = [ord(c) for c in res(test_string)]
             self.cache.store('youtube-sigfuncs', func_id, cache_spec)
@@ -2039,7 +2040,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 f'    return {expr_code}\n')
         self.to_screen('Extracted signature function:\n' + code)
 
-    def _parse_sig_js(self, jscode):
+    def _parse_sig_js(self, jscode, player_url):
         # Examples where `sig` is funcname:
         # sig=function(a){a=a.split(""); ... ;return a.join("")};
         # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a};
@@ -2063,12 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
             jscode, 'Initial JS player signature function name', group='sig')
 
+        varname, global_list = self._interpret_player_js_global_var(jscode, player_url)
         jsi = JSInterpreter(jscode)
-        global_var_map = {}
-        _, varname, value = self._extract_player_js_global_var(jscode)
-        if varname:
-            global_var_map[varname] = jsi.interpret_expression(value, {}, allow_recursion=100)
-        initial_function = jsi.extract_function(funcname, global_var_map)
+        initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list}))
         return lambda s: initial_function([s])
 
     def _cached(self, func, *cache_id):
@@ -2093,7 +2091,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if func_code := self._player_cache.get(cache_id):
             return func_code
 
-        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26')
+        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.27')
         if func_code:
             self._player_cache[cache_id] = func_code
 
@@ -2150,6 +2148,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return ret
 
     def _extract_n_function_name(self, jscode, player_url=None):
+        varname, global_list = self._interpret_player_js_global_var(jscode, player_url)
+        if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any)):
+            funcname = self._search_regex(
+                r'''(?xs)
+                    [;\n](?:
+                        (?P<f>function\s+)|
+                        (?:var\s+)?
+                    )(?P<funcname>[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*)
+                    \((?P<argname>[a-zA-Z0-9_$]+)\)\s*\{
+                    (?:(?!\}[;\n]).)+
+                    \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s*
+                    \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n]
+                ''' % (re.escape(varname), global_list.index(debug_str)),
+                jscode, 'nsig function name', group='funcname', default=None)
+            if funcname:
+                return funcname
+            self.write_debug(join_nonempty(
+                'Initial search was unable to find nsig function name',
+                player_url and f'        player = {player_url}', delim='\n'), only_once=True)
+
         # Examples (with placeholders nfunc, narray, idx):
         # *  .get("n"))&&(b=nfunc(b)
         # *  .get("n"))&&(b=narray[idx](b)
@@ -2179,7 +2197,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if not funcname:
             self.report_warning(join_nonempty(
                 'Falling back to generic n function search',
-                player_url and f'         player = {player_url}', delim='\n'))
+                player_url and f'         player = {player_url}', delim='\n'), only_once=True)
             return self._search_regex(
                 r'''(?xs)
                 ;\s*(?P<name>[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\)
@@ -2192,9 +2210,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode,
             f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)]
 
-    def _extract_player_js_global_var(self, jscode):
+    def _extract_player_js_global_var(self, jscode, player_url):
         """Returns tuple of strings: variable assignment code, variable name, variable value code"""
-        return self._search_regex(
+        extract_global_var = self._cached(self._search_regex, 'js global array', player_url)
+        varcode, varname, varvalue = extract_global_var(
             r'''(?x)
                 (?P<q1>["\'])use\s+strict(?P=q1);\s*
                 (?P<code>
@@ -2206,17 +2225,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     )
                 )[;,]
             ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None))
+        if not varcode:
+            self.write_debug(join_nonempty(
+                'No global array variable found in player JS',
+                player_url and f'        player = {player_url}', delim='\n'), only_once=True)
+        return varcode, varname, varvalue
 
-    def _fixup_n_function_code(self, argnames, code, full_code):
-        global_var, varname, _ = self._extract_player_js_global_var(full_code)
-        if global_var:
-            self.write_debug(f'Prepending n function code with global array variable "{varname}"')
-            code = global_var + '; ' + code
+    def _interpret_player_js_global_var(self, jscode, player_url):
+        """Returns tuple of: variable name string, variable value list"""
+        _, varname, array_code = self._extract_player_js_global_var(jscode, player_url)
+        jsi = JSInterpreter(array_code)
+        interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url)
+        return varname, interpret_global_var(array_code, {}, allow_recursion=10)
+
+    def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url):
+        varcode, varname, _ = self._extract_player_js_global_var(jscode, player_url)
+        if varcode and varname:
+            nsig_code = varcode + '; ' + nsig_code
+            _, global_list = self._interpret_player_js_global_var(jscode, player_url)
         else:
-            self.write_debug('No global array variable found in player JS')
-        return argnames, re.sub(
-            rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:(["\'])undefined\1|{varname}\[\d+\])\s*\)\s*return\s+{argnames[0]};',
-            ';', code)
+            varname = 'dlp_wins'
+            global_list = []
+
+        undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+'
+        fixed_code = re.sub(
+            rf'''(?x)
+                ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:
+                    (["\'])undefined\1|
+                    {re.escape(varname)}\[{undefined_idx}\]
+                )\s*\)\s*return\s+{re.escape(argnames[0])};
+            ''', ';', nsig_code)
+        if fixed_code == nsig_code:
+            self.write_debug(join_nonempty(
+                'No typeof statement found in nsig function code',
+                player_url and f'        player = {player_url}', delim='\n'), only_once=True)
+        return argnames, fixed_code
 
     def _extract_n_function_code(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)
@@ -2230,7 +2273,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         func_name = self._extract_n_function_name(jscode, player_url=player_url)
 
         # XXX: Workaround for the global array variable and lack of `typeof` implementation
-        func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode)
+        func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url)
 
         return jsi, player_id, func_code