From e0df6211cc9364f62406b2907fa830847324db53 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 14:19:30 +0200 Subject: [PATCH 01/21] Restore accidentally deleted commits That's what happens if you let Windows machines write :( --- .gitignore | 1 + test/test_youtube_signature.py | 80 +++++ youtube_dl/extractor/youtube.py | 611 ++++++++++++++++++++++++++++++-- youtube_dl/utils.py | 6 + 4 files changed, 677 insertions(+), 21 deletions(-) create mode 100644 test/test_youtube_signature.py diff --git a/.gitignore b/.gitignore index 61cb6bc3c..24fdb3626 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ updates_key.pem *.flv *.mp4 *.part +test/testdata diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py new file mode 100644 index 000000000..2c06caef4 --- /dev/null +++ b/test/test_youtube_signature.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +import io +import re +import string +import sys +import unittest + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import YoutubeIE +from youtube_dl.utils import compat_str, compat_urlretrieve + +_TESTS = [ + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', + u'js', + 86, + u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', + ), + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', + u'js', + 85, + u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', + ), + ( + u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf', + u'swf', + 82, + u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"' + ), +] + + +class TestSignature(unittest.TestCase): + def setUp(self): + TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + if not os.path.exists(self.TESTDATA_DIR): + os.mkdir(self.TESTDATA_DIR) + + +def make_testfunc(url, stype, sig_length, expected_sig): + basename = url.rpartition('/')[2] + m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) + assert m, '%r should follow URL format' % basename + test_id = m.group(1) + + def test_func(self): + fn = os.path.join(self.TESTDATA_DIR, basename) + + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + + ie = YoutubeIE() + if stype == 'js': + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) + else: + assert stype == 'swf' + with open(fn, 'rb') as testf: + swfcode = testf.read() + func = ie._parse_sig_swf(swfcode) + src_sig = compat_str(string.printable[:sig_length]) + got_sig = func(src_sig) + self.assertEqual(got_sig, expected_sig) + + test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + setattr(TestSignature, test_func.__name__, test_func) + +for test_spec in _TESTS: + make_testfunc(*test_spec) + + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 47d5cb7ff..456d3cb0f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,11 +1,16 @@ # coding: utf-8 +import collections +import itertools +import io import json import netrc import re import socket -import itertools -import xml.etree.ElementTree +import string +import struct +import traceback +import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor @@ -393,6 +398,10 @@ def suitable(cls, url): if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._jsplayer_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -413,15 +422,565 @@ def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url): + id_m = re.match(r'.*-(?P[^.]+)\.(?P[^.]+)$', player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + return self._parse_sig_js(code) + elif player_tpye == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + return self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExctractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + _, pos = u30(pos=pos) # Slot id + type_name_idx, pos = u30(pos=pos) + vindex, pos = u30(pos=pos) + if vindex != 0: + _, pos = read_byte(pos=pos) # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + _, pos = u30(pos=pos) # disp_id + method_idx, pos = u30(pos=pos) + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + _, pos = u30(pos=pos) # slot_id + _, pos = u30(pos=pos) # classi + elif kind == 0x05: # Function + _, pos = u30(pos=pos) # slot_id + function_idx, pos = u30(pos=pos) + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count, pos = u30(pos=pos) + for _c3 in range(metadata_count): + _, pos = u30(pos=pos) + + return (methods, pos) + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count, p = u30() + for class_id in range(class_count): + name_idx, p = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + _, p = u30() # super_name idx + flags, p = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + protected_ns_idx, p = u30() + intrf_count, p = u30() + for _c2 in range(intrf_count): + _, p = u30() + _, p = u30() # iinit + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + _, p = u30() # cinit + trait_count, p = u30() + for _c2 in range(trait_count): + trait_methods, p = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count, p = u30() + for _c in range(script_count): + _, p = u30() # init + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + # Method bodies + method_body_count, p = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx, p = u30() + max_stack, p = u30() + local_count, p = u30() + init_scope_depth, p = u30() + max_scope_depth, p = u30() + code_length, p = u30() + if method_idx in method_idxs: + m = Method(code_tag[p:p+code_length], local_count) + methods[method_idxs[method_idx]] = m + p += code_length + exception_count, p = u30() + for _c2 in range(exception_count): + _, p = u30() # from + _, p = u30() # to + _, p = u30() # target + _, p = u30() # exc_type + _, p = u30() # var_name + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + assert p == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + print('Entering function %s(%r)' % (func_name, args)) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + _ = u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 93: - return s[86:29:-1] + s[88] + s[28:5:-1] - elif len(s) == 92: + if jsplayer_url is not None: + try: + if jsplayer_url not in self._jsplayer_cache: + self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( + video_id, jsplayer_url + ) + return self._jsplayer_cache[jsplayer_url]([s]) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + + self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] - elif len(s) == 91: - return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: @@ -631,7 +1190,7 @@ def _real_extract(self, url): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -784,21 +1343,31 @@ def _real_extract(self, url): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: - if self._downloader.params.get('verbose'): - s = url_data['s'][0] - if age_gate: - player = 'flash player' - else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, - 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) - self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, url_data['itag'][0], player)) encrypted_sig = url_data['s'][0] + if self._downloader.params.get('verbose'): + if age_gate: + player_version = self._search_regex(r'-(.+)\.swf$', + player_url if player_url else 'NOT FOUND', + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, + 'html5 player', fatal=False) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) + self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) + jsplayer_url = None else: - signature = self._decrypt_signature(encrypted_sig) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + jsplayer_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 814a9b6be..201ed255d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -66,6 +66,12 @@ except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL From a7177865b19cdf711f15e01541aee9deae97a56c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 14:48:12 +0200 Subject: [PATCH 02/21] Implement more opcodes --- youtube_dl/extractor/youtube.py | 45 ++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 456d3cb0f..b57693ee6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -863,13 +863,7 @@ def resfunc(args): coder = io.BytesIO(m.code) while True: opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 36: # pushbyte + if opcode == 36: # pushbyte v = struct.unpack('!B', coder.read(1))[0] stack.append(v) elif opcode == 44: # pushstring @@ -895,12 +889,41 @@ def resfunc(args): else: res = obj.split(args[0]) stack.append(res) + elif mname == u'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == u'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) elif mname in method_pyfunctions: stack.append(method_pyfunctions[mname](args)) else: raise NotImplementedError( u'Unsupported property %r on %r' % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 79: # callpropvoid + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + u'Unsupported (void) property %r on %r' + % (mname, obj)) elif opcode == 93: # findpropstrict index = u30(coder) mname = multinames[index] @@ -943,6 +966,14 @@ def resfunc(args): value1 = stack.pop() res = value1 % value2 stack.append(res) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) elif opcode == 214: # setlocal_2 registers[2] = stack.pop() elif opcode == 215: # setlocal_3 From 95dbd2f9907416e86424e4372dbd2593c1699e7d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:10:38 +0200 Subject: [PATCH 03/21] Change test target (Verified with node.js) --- test/test_youtube_signature.py | 2 +- youtube_dl/extractor/youtube.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 2c06caef4..36533cf1f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -30,7 +30,7 @@ u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf', u'swf', 82, - u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"' + u':/.-,+*)=\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBAzyxw>utsrqponmlkjihgfedcba987654321' ), ] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b57693ee6..45b593a12 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -857,7 +857,6 @@ def extract_function(func_name): m = methods[func_name] def resfunc(args): - print('Entering function %s(%r)' % (func_name, args)) registers = ['(this)'] + list(args) + [None] * m.local_count stack = [] coder = io.BytesIO(m.code) From 8379969834b787708ef5574dc447028c1caf295b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:19:48 +0200 Subject: [PATCH 04/21] Prepare signature function caching --- youtube_dl/extractor/youtube.py | 57 ++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 45b593a12..2cd2fdce3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -400,7 +400,7 @@ def suitable(cls, url): def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) - self._jsplayer_cache = {} + self._player_cache = {} def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" @@ -423,26 +423,33 @@ def report_rtmp_download(self): self.to_screen(u'RTMP download detected') def _extract_signature_function(self, video_id, player_url): - id_m = re.match(r'.*-(?P[^.]+)\.(?P[^.]+)$', player_url) + id_m = re.match(r'.*-(?P[a-zA-Z0-9]+)\.(?P[a-z]+)$', + player_url) player_type = id_m.group('ext') player_id = id_m.group('id') + # TODO read from filesystem cache + if player_type == 'js': code = self._download_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, jsplayer_id), + note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) - return self._parse_sig_js(code) + res = self._parse_sig_js(code) elif player_tpye == 'swf': urlh = self._request_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, jsplayer_id), + note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) code = urlh.read() - return self._parse_sig_swf(code) + res = self._parse_sig_swf(code) else: assert False, 'Invalid player type %r' % player_type + # TODO write cache + + return res + def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, @@ -987,22 +994,27 @@ def resfunc(args): initial_function = extract_function(u'decipher') return lambda s: initial_function([s]) - def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if jsplayer_url is not None: + if player_url is not None: try: - if jsplayer_url not in self._jsplayer_cache: - self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( - video_id, jsplayer_url + if player_url not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url ) - return self._jsplayer_cache[jsplayer_url]([s]) + self._player_cache[player_url] = func + return self._player_cache[player_url](s) except Exception as e: tb = traceback.format_exc() - self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + self._downloader.report_warning( + u'Automatic signature extraction failed: ' + tb) - self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') + return self._static_decrypt_signature(s) + def _static_decrypt_signature(self, s): if age_gate: # The videos with age protection use another player, so the # algorithms can be different. @@ -1376,12 +1388,14 @@ def _real_extract(self, url): encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): if age_gate: - player_version = self._search_regex(r'-(.+)\.swf$', - player_url if player_url else 'NOT FOUND', + player_version = self._search_regex( + r'-(.+)\.swf$', + player_url if player_url else None, 'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: - player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) player_desc = u'html5 player %s' % player_version @@ -1389,15 +1403,14 @@ def _real_extract(self, url): self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) - if age_gate: - jsplayer_url = None - else: + if not age_gate: jsplayer_url_json = self._search_regex( r'"assets":.+?"js":\s*("[^"]+")', video_webpage, u'JS player URL') - jsplayer_url = json.loads(jsplayer_url_json) + player_url = json.loads(jsplayer_url_json) - signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' From ba552f542f674d35de21d48978f211b8db3f0ff8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:32:37 +0200 Subject: [PATCH 05/21] Use reader instead of indexing --- youtube_dl/extractor/youtube.py | 254 +++++++++++++++----------------- 1 file changed, 118 insertions(+), 136 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2cd2fdce3..09bd423f5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -590,99 +590,83 @@ def extract_tags(content): for tag_code, tag in extract_tags(content) if tag_code == 82) p = code_tag.index(b'\0', 4) + 1 + code_reader = io.BytesIO(code_tag[p:]) # Parse ABC (AVM2 ByteCode) - def read_int(data=None, pos=None): - if hasattr(data, 'read'): - assert pos is None - - res = 0 - shift = 0 - for _ in range(5): - buf = data.read(1) - assert len(buf) == 1 - b = struct.unpack('> 4 methods = {} if kind in [0x00, 0x06]: # Slot or Const - _, pos = u30(pos=pos) # Slot id - type_name_idx, pos = u30(pos=pos) - vindex, pos = u30(pos=pos) + _ = u30() # Slot id + type_name_idx = u30() + vindex = u30() if vindex != 0: - _, pos = read_byte(pos=pos) # vkind + _ = read_byte() # vkind elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - _, pos = u30(pos=pos) # disp_id - method_idx, pos = u30(pos=pos) + _ = u30() # disp_id + method_idx = u30() methods[multinames[trait_name_idx]] = method_idx elif kind == 0x04: # Class - _, pos = u30(pos=pos) # slot_id - _, pos = u30(pos=pos) # classi + _ = u30() # slot_id + _ = u30() # classi elif kind == 0x05: # Function - _, pos = u30(pos=pos) # slot_id - function_idx, pos = u30(pos=pos) + _ = u30() # slot_id + function_idx = u30() methods[function_idx] = multinames[trait_name_idx] else: raise ExtractorError(u'Unsupported trait kind %d' % kind) if attrs & 0x4 != 0: # Metadata present - metadata_count, pos = u30(pos=pos) + metadata_count = u30() for _c3 in range(metadata_count): - _, pos = u30(pos=pos) + _ = u30() - return (methods, pos) + return methods # Classes TARGET_CLASSNAME = u'SignatureDecipher' searched_idx = multinames.index(TARGET_CLASSNAME) searched_class_id = None - class_count, p = u30() + class_count = u30() for class_id in range(class_count): - name_idx, p = u30() + name_idx = u30() if name_idx == searched_idx: # We found the class we're looking for! searched_class_id = class_id - _, p = u30() # super_name idx - flags, p = read_byte() + _ = u30() # super_name idx + flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present - protected_ns_idx, p = u30() - intrf_count, p = u30() + protected_ns_idx = u30() + intrf_count = u30() for _c2 in range(intrf_count): - _, p = u30() - _, p = u30() # iinit - trait_count, p = u30() + _ = u30() + _ = u30() # iinit + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() if searched_class_id is None: raise ExtractorError(u'Target class %r not found' % @@ -807,10 +789,10 @@ def parse_traits_info(pos=None): method_names = {} method_idxs = {} for class_id in range(class_count): - _, p = u30() # cinit - trait_count, p = u30() + _ = u30() # cinit + trait_count = u30() for _c2 in range(trait_count): - trait_methods, p = parse_traits_info() + trait_methods = parse_traits_info() if class_id == searched_class_id: method_names.update(trait_methods.items()) method_idxs.update(dict( @@ -818,40 +800,40 @@ def parse_traits_info(pos=None): for name, idx in trait_methods.items())) # Scripts - script_count, p = u30() + script_count = u30() for _c in range(script_count): - _, p = u30() # init - trait_count, p = u30() + _ = u30() # init + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() # Method bodies - method_body_count, p = u30() + method_body_count = u30() Method = collections.namedtuple('Method', ['code', 'local_count']) methods = {} for _c in range(method_body_count): - method_idx, p = u30() - max_stack, p = u30() - local_count, p = u30() - init_scope_depth, p = u30() - max_scope_depth, p = u30() - code_length, p = u30() + method_idx = u30() + max_stack = u30() + local_count = u30() + init_scope_depth = u30() + max_scope_depth = u30() + code_length = u30() + code = read_bytes(code_length) if method_idx in method_idxs: - m = Method(code_tag[p:p+code_length], local_count) + m = Method(code, local_count) methods[method_idxs[method_idx]] = m - p += code_length - exception_count, p = u30() + exception_count = u30() for _c2 in range(exception_count): - _, p = u30() # from - _, p = u30() # to - _, p = u30() # target - _, p = u30() # exc_type - _, p = u30() # var_name - trait_count, p = u30() + _ = u30() # from + _ = u30() # to + _ = u30() # target + _ = u30() # exc_type + _ = u30() # var_name + trait_count = u30() for _c2 in range(trait_count): - _, p = parse_traits_info() + _ = parse_traits_info() - assert p == len(code_tag) + assert p + code_reader.tell() == len(code_tag) assert len(methods) == len(method_idxs) method_pyfunctions = {} From 2f2ffea9cad7d30165a0171bf6e662bef2182ab4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Sep 2013 15:34:29 +0200 Subject: [PATCH 06/21] Clarify a couple of calls --- youtube_dl/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 09bd423f5..5c0ea2e43 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -641,7 +641,7 @@ def read_byte(reader=None): return res # minor_version + major_version - _ = read_bytes(4) + _ = read_bytes(2 + 2) # Constant pool int_count = u30() @@ -994,9 +994,10 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): self._downloader.report_warning( u'Warning: Falling back to static signature algorithm') - return self._static_decrypt_signature(s) + return self._static_decrypt_signature( + s, video_id, player_url, age_gate) - def _static_decrypt_signature(self, s): + def _static_decrypt_signature(self, s, video_id, player_url, age_gate): if age_gate: # The videos with age protection use another player, so the # algorithms can be different. From c4417ddb611e14b81fe56b6b32964c5802faf554 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 00:35:03 +0200 Subject: [PATCH 07/21] [youtube] Add filesystem signature cache --- youtube_dl/FileDownloader.py | 2 ++ youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 0b5a5d77d..1eb71a80e 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -39,6 +39,8 @@ class FileDownloader(object): test: Download only first bytes to test the downloader. min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size + cachedir: Location of the cache files in the filesystem. + False to disable filesystem cache. """ params = None diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5c0ea2e43..63f59ae8f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -4,8 +4,10 @@ import itertools import io import json -import netrc +import operator +import os.path import re +import shutil import socket import string import struct @@ -422,13 +424,28 @@ def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _extract_signature_function(self, video_id, player_url): - id_m = re.match(r'.*-(?P[a-zA-Z0-9]+)\.(?P[a-z]+)$', + def _extract_signature_function(self, video_id, player_url, slen): + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', player_url) player_type = id_m.group('ext') player_id = id_m.group('id') - # TODO read from filesystem cache + # Read from filesystem cache + func_id = '%s_%s_%d' % (player_type, player_id, slen) + assert os.path.basename(func_id) == func_id + cache_dir = self.downloader.params.get('cachedir', + u'~/.youtube-dl/cache') + + if cache_dir is not False: + cache_fn = os.path.join(os.path.expanduser(cache_dir), + u'youtube-sigfuncs', + func_id + '.json') + try: + with io.open(cache_fn, '', encoding='utf-8') as cachef: + cache_spec = json.load(cachef) + return lambda s: u''.join(s[i] for i in cache_spec) + except OSError: + pass # No cache available if player_type == 'js': code = self._download_webpage( @@ -436,7 +453,7 @@ def _extract_signature_function(self, video_id, player_url): note=u'Downloading %s player %s' % (player_type, player_id), errnote=u'Download of %s failed' % player_url) res = self._parse_sig_js(code) - elif player_tpye == 'swf': + elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, note=u'Downloading %s player %s' % (player_type, player_id), @@ -446,7 +463,11 @@ def _extract_signature_function(self, video_id, player_url): else: assert False, 'Invalid player type %r' % player_type - # TODO write cache + if cache_dir is not False: + cache_res = res(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + shutil.makedirs(os.path.dirname(cache_fn)) + write_json_file(cache_spec, cache_fn) return res @@ -983,7 +1004,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): try: if player_url not in self._player_cache: func = self._extract_signature_function( - video_id, player_url + video_id, player_url, len(s) ) self._player_cache[player_url] = func return self._player_cache[player_url](s) From edf3e38ebd6c5db21585dc7b6384e325e6cfb540 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:30:02 +0200 Subject: [PATCH 08/21] [youtube] Improve cache and add an option to print the extracted signatures --- youtube_dl/FileDownloader.py | 2 +- youtube_dl/__init__.py | 6 +++ youtube_dl/extractor/youtube.py | 69 +++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 1eb71a80e..604714134 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -40,7 +40,7 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size cachedir: Location of the cache files in the filesystem. - False to disable filesystem cache. + "NONE" to disable filesystem cache. """ params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1ed30aae3..072f69f2e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -167,6 +167,7 @@ def _hide_login_info(opts): help='Output descriptions of all supported extractors', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') + general.add_option('--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', help='Location in the filesystem where youtube-dl can store downloaded information permanently. NONE to disable filesystem caching, %default by default') selection.add_option('--playlist-start', @@ -272,6 +273,10 @@ def _hide_login_info(opts): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--youtube-print-sig-code', + action='store_true', dest='youtube_print_sig_code', default=False, + help=optparse.SUPPRESS_HELP) + filesystem.add_option('-t', '--title', action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -613,6 +618,7 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, + 'youtube_print_sig_code': opts.youtube_print_sig_code }) if opts.verbose: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63f59ae8f..4200f987e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,13 +1,13 @@ # coding: utf-8 import collections +import errno import itertools import io import json import operator import os.path import re -import shutil import socket import string import struct @@ -17,6 +17,7 @@ from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_chr, compat_http_client, compat_parse_qs, compat_urllib_error, @@ -30,6 +31,7 @@ unescapeHTML, unified_strdate, orderedSet, + write_json_file, ) class YoutubeBaseInfoExtractor(InfoExtractor): @@ -433,18 +435,18 @@ def _extract_signature_function(self, video_id, player_url, slen): # Read from filesystem cache func_id = '%s_%s_%d' % (player_type, player_id, slen) assert os.path.basename(func_id) == func_id - cache_dir = self.downloader.params.get('cachedir', - u'~/.youtube-dl/cache') + cache_dir = self._downloader.params.get('cachedir', + u'~/.youtube-dl/cache') - if cache_dir is not False: + if cache_dir != u'NONE': cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', func_id + '.json') try: - with io.open(cache_fn, '', encoding='utf-8') as cachef: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: cache_spec = json.load(cachef) return lambda s: u''.join(s[i] for i in cache_spec) - except OSError: + except IOError: pass # No cache available if player_type == 'js': @@ -464,13 +466,55 @@ def _extract_signature_function(self, video_id, player_url, slen): assert False, 'Invalid player type %r' % player_type if cache_dir is not False: - cache_res = res(map(compat_chr, range(slen))) - cache_spec = [ord(c) for c in cache_res] - shutil.makedirs(os.path.dirname(cache_fn)) - write_json_file(cache_spec, cache_fn) + try: + cache_res = res(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + try: + os.makedirs(os.path.dirname(cache_fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(cache_spec, cache_fn) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Writing cache to %r failed: %s' % (cache_fn, tb)) return res + def _print_sig_code(self, func, slen): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = u'' if start == 0 else str(start) + ends = u':%d' % (end+step) + steps = u'' if step == 1 else (':%d' % step) + return u's[%s%s%s]' % (starts, ends, steps) + + step = None + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield u's[%d]' % prev + if step is None: + yield u's[%d]' % i + else: + yield _genslice(start, i, step) + + cache_res = func(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + expr_code = u' + '.join(gen_sig_code(cache_spec)) + code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) + self.to_screen(u'Extracted signature:\n' + code) + def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, @@ -1007,7 +1051,10 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): video_id, player_url, len(s) ) self._player_cache[player_url] = func - return self._player_cache[player_url](s) + func = self._player_cache[player_url] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) except Exception as e: tb = traceback.format_exc() self._downloader.report_warning( From 4ba146f35dd797e9d78636cb3cffabb100575240 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:31:25 +0200 Subject: [PATCH 09/21] Update static signatures --- youtube_dl/extractor/youtube.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4200f987e..8245349b2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,8 +1072,12 @@ def _static_decrypt_signature(self, s, video_id, player_url, age_gate): if len(s) == 86: return s[2:63] + s[82] + s[64:82] + s[63] - if len(s) == 92: + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] + elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: From 0ca96d48c7f74e122be70b71bb5fe38f4b143cb0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:37:23 +0200 Subject: [PATCH 10/21] [youtube] Improve source code quality --- youtube_dl/extractor/youtube.py | 104 ++++++++++++++++---------------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8245349b2..a9bfc455f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,16 +2,16 @@ import collections import errno -import itertools import io +import itertools import json -import operator import os.path import re import socket import string import struct import traceback +import xml.etree.ElementTree import zlib from .common import InfoExtractor, SearchInfoExtractor @@ -475,7 +475,7 @@ def _extract_signature_function(self, video_id, player_url, slen): if ose.errno != errno.EEXIST: raise write_json_file(cache_spec, cache_fn) - except Exception as e: + except Exception: tb = traceback.format_exc() self._downloader.report_warning( u'Writing cache to %r failed: %s' % (cache_fn, tb)) @@ -491,6 +491,8 @@ def _genslice(start, end, step): return u's[%s%s%s]' % (starts, ends, steps) step = None + start = '(Never used)' # Quelch pyflakes warnings - start will be + # set as soon as step is set for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: @@ -527,7 +529,7 @@ def argidx(varname): def interpret_statement(stmt, local_vars, allow_recursion=20): if allow_recursion < 0: - raise ExctractorError(u'Recursion limit reached') + raise ExtractorError(u'Recursion limit reached') if stmt.startswith(u'var '): stmt = stmt[len(u'var '):] @@ -685,7 +687,7 @@ def s32(reader=None): v = - ((v ^ 0xffffffff) + 1) return v - def string(reader=None): + def read_string(reader=None): if reader is None: reader = code_reader slen = u30(reader) @@ -706,31 +708,31 @@ def read_byte(reader=None): return res # minor_version + major_version - _ = read_bytes(2 + 2) + read_bytes(2 + 2) # Constant pool int_count = u30() for _c in range(1, int_count): - _ = s32() + s32() uint_count = u30() for _c in range(1, uint_count): - _ = u32() + u32() double_count = u30() - _ = read_bytes((double_count-1) * 8) + read_bytes((double_count-1) * 8) string_count = u30() constant_strings = [u''] for _c in range(1, string_count): - s = string() + s = read_string() constant_strings.append(s) namespace_count = u30() for _c in range(1, namespace_count): - _ = read_bytes(1) # kind - _ = u30() # name + read_bytes(1) # kind + u30() # name ns_set_count = u30() for _c in range(1, ns_set_count): count = u30() for _c2 in range(count): - _ = u30() + u30() multiname_count = u30() MULTINAME_SIZES = { 0x07: 2, # QName @@ -749,13 +751,13 @@ def read_byte(reader=None): kind = u30() assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind if kind == 0x07: - namespace_idx = u30() + u30() # namespace_idx name_idx = u30() multinames.append(constant_strings[name_idx]) else: multinames.append('[MULTINAME kind: %d]' % kind) for _c2 in range(MULTINAME_SIZES[kind]): - _ = u30() + u30() # Methods method_count = u30() @@ -765,32 +767,32 @@ def read_byte(reader=None): method_infos = [] for method_id in range(method_count): param_count = u30() - _ = u30() # return type + u30() # return type for _ in range(param_count): - _ = u30() # param type - _ = u30() # name index (always 0 for youtube) + u30() # param type + u30() # name index (always 0 for youtube) flags = read_byte() if flags & 0x08 != 0: # Options present option_count = u30() for c in range(option_count): - _ = u30() # val - _ = read_bytes(1) # kind + u30() # val + read_bytes(1) # kind if flags & 0x80 != 0: # Param names present for _ in range(param_count): - _ = u30() # param name + u30() # param name mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) method_infos.append(mi) # Metadata metadata_count = u30() for _c in range(metadata_count): - _ = u30() # name + u30() # name item_count = u30() for _c2 in range(item_count): - _ = u30() # key - _ = u30() # value + u30() # key + u30() # value def parse_traits_info(): trait_name_idx = u30() @@ -799,20 +801,20 @@ def parse_traits_info(): attrs = kind_full >> 4 methods = {} if kind in [0x00, 0x06]: # Slot or Const - _ = u30() # Slot id - type_name_idx = u30() + u30() # Slot id + u30() # type_name_idx vindex = u30() if vindex != 0: - _ = read_byte() # vkind + read_byte() # vkind elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - _ = u30() # disp_id + u30() # disp_id method_idx = u30() methods[multinames[trait_name_idx]] = method_idx elif kind == 0x04: # Class - _ = u30() # slot_id - _ = u30() # classi + u30() # slot_id + u30() # classi elif kind == 0x05: # Function - _ = u30() # slot_id + u30() # slot_id function_idx = u30() methods[function_idx] = multinames[trait_name_idx] else: @@ -821,7 +823,7 @@ def parse_traits_info(): if attrs & 0x4 != 0: # Metadata present metadata_count = u30() for _c3 in range(metadata_count): - _ = u30() + u30() # metadata index return methods @@ -835,17 +837,17 @@ def parse_traits_info(): if name_idx == searched_idx: # We found the class we're looking for! searched_class_id = class_id - _ = u30() # super_name idx + u30() # super_name idx flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present - protected_ns_idx = u30() + u30() # protected_ns_idx intrf_count = u30() for _c2 in range(intrf_count): - _ = u30() - _ = u30() # iinit + u30() + u30() # iinit trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() if searched_class_id is None: raise ExtractorError(u'Target class %r not found' % @@ -854,7 +856,7 @@ def parse_traits_info(): method_names = {} method_idxs = {} for class_id in range(class_count): - _ = u30() # cinit + u30() # cinit trait_count = u30() for _c2 in range(trait_count): trait_methods = parse_traits_info() @@ -867,10 +869,10 @@ def parse_traits_info(): # Scripts script_count = u30() for _c in range(script_count): - _ = u30() # init + u30() # init trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() # Method bodies method_body_count = u30() @@ -878,10 +880,10 @@ def parse_traits_info(): methods = {} for _c in range(method_body_count): method_idx = u30() - max_stack = u30() + u30() # max_stack local_count = u30() - init_scope_depth = u30() - max_scope_depth = u30() + u30() # init_scope_depth + u30() # max_scope_depth code_length = u30() code = read_bytes(code_length) if method_idx in method_idxs: @@ -889,14 +891,14 @@ def parse_traits_info(): methods[method_idxs[method_idx]] = m exception_count = u30() for _c2 in range(exception_count): - _ = u30() # from - _ = u30() # to - _ = u30() # target - _ = u30() # exc_type - _ = u30() # var_name + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name trait_count = u30() for _c2 in range(trait_count): - _ = parse_traits_info() + parse_traits_info() assert p + code_reader.tell() == len(code_tag) assert len(methods) == len(method_idxs) @@ -1011,7 +1013,7 @@ def resfunc(args): assert isinstance(obj, list) stack.append(obj[idx]) elif opcode == 128: # coerce - _ = u30(coder) + u30(coder) elif opcode == 133: # coerce_s assert isinstance(stack[-1], (type(None), compat_str)) elif opcode == 164: # modulo @@ -1055,7 +1057,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): if self._downloader.params.get('youtube_print_sig_code'): self._print_sig_code(func, len(s)) return func(s) - except Exception as e: + except Exception: tb = traceback.format_exc() self._downloader.report_warning( u'Automatic signature extraction failed: ' + tb) From f8061589e66f12f6c2ffac3d7bfba2a7ac0294d5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:50:12 +0200 Subject: [PATCH 11/21] [youtube] Actually pass in cachedir option --- youtube_dl/__init__.py | 3 ++- youtube_dl/extractor/youtube.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 072f69f2e..a4769a8ae 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -618,7 +618,8 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, - 'youtube_print_sig_code': opts.youtube_print_sig_code + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, }) if opts.verbose: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a9bfc455f..2dd2db673 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -438,7 +438,8 @@ def _extract_signature_function(self, video_id, player_url, slen): cache_dir = self._downloader.params.get('cachedir', u'~/.youtube-dl/cache') - if cache_dir != u'NONE': + cache_enabled = cache_dir != u'NONE' + if cache_enabled: cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', func_id + '.json') @@ -465,7 +466,7 @@ def _extract_signature_function(self, video_id, player_url, slen): else: assert False, 'Invalid player type %r' % player_type - if cache_dir is not False: + if cache_enabled: try: cache_res = res(map(compat_chr, range(slen))) cache_spec = [ord(c) for c in cache_res] @@ -515,7 +516,7 @@ def _genslice(start, end, step): cache_spec = [ord(c) for c in cache_res] expr_code = u' + '.join(gen_sig_code(cache_spec)) code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) - self.to_screen(u'Extracted signature:\n' + code) + self.to_screen(u'Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( From c35f9e72ce842ecd476bee3767527da0e675dd1a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 11:09:25 +0200 Subject: [PATCH 12/21] Move cachedir doc --- youtube_dl/FileDownloader.py | 2 -- youtube_dl/YoutubeDL.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 604714134..0b5a5d77d 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -39,8 +39,6 @@ class FileDownloader(object): test: Download only first bytes to test the downloader. min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size - cachedir: Location of the cache files in the filesystem. - "NONE" to disable filesystem cache. """ params = None diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fa24ebe0d..ead1ccb1c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -81,6 +81,8 @@ class YoutubeDL(object): keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + "NONE" to disable filesystem cache. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: From 13dc64ce741520ba54ba9fff0ab1a3ac4e5c43a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 11:17:21 +0200 Subject: [PATCH 13/21] [youtube] Remove _decrypt_signature_age_gate --- youtube_dl/extractor/youtube.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2dd2db673..56ad33fdc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1109,15 +1109,6 @@ def _static_decrypt_signature(self, s, video_id, player_url, age_gate): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _decrypt_signature_age_gate(self, s): - # The videos with age protection use another player, so the algorithms - # can be different. - if len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] - else: - # Fallback to the other algortihms - return self._decrypt_signature(s) - def _get_available_subtitles(self, video_id): try: sub_list = self._download_webpage( From 45f4a76dbc268a56c212fe25cd27922541840cfe Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 11:45:29 +0200 Subject: [PATCH 14/21] Work around nosetests nosiness --- test/test_youtube_signature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 36533cf1f..5007d9a16 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -43,7 +43,7 @@ def setUp(self): os.mkdir(self.TESTDATA_DIR) -def make_testfunc(url, stype, sig_length, expected_sig): +def make_tfunc(url, stype, sig_length, expected_sig): basename = url.rpartition('/')[2] m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) assert m, '%r should follow URL format' % basename @@ -73,7 +73,7 @@ def test_func(self): setattr(TestSignature, test_func.__name__, test_func) for test_spec in _TESTS: - make_testfunc(*test_spec) + make_tfunc(*test_spec) if __name__ == '__main__': From bdde940e90320e350bd96df621ee7e32641e1eca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:17:42 +0200 Subject: [PATCH 15/21] [youtube] Improve flash player URL handling --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 56ad33fdc..888907c93 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1437,10 +1437,12 @@ def _real_extract(self, url): encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): if age_gate: - player_version = self._search_regex( - r'-(.+)\.swf$', - player_url if player_url else None, - 'flash player', fatal=False) + if player_url is None: + player_version = 'unknown' + else: + player_version = self._search_regex( + r'-(.+)\.swf$', player_url, + u'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( From d2d8f895310be7fa302ba7755c60d5948866fcaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:18:10 +0200 Subject: [PATCH 16/21] Do not warn if fallback is without alternatives (because we did not get the flash player URL) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 888907c93..780690ed0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1063,8 +1063,8 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): self._downloader.report_warning( u'Automatic signature extraction failed: ' + tb) - self._downloader.report_warning( - u'Warning: Falling back to static signature algorithm') + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') return self._static_decrypt_signature( s, video_id, player_url, age_gate) From c705320f485cd962827fce464a93993569e3173f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 12:18:16 +0200 Subject: [PATCH 17/21] Correct test strings --- youtube_dl/extractor/youtube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 780690ed0..049da2f91 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -468,7 +468,8 @@ def _extract_signature_function(self, video_id, player_url, slen): if cache_enabled: try: - cache_res = res(map(compat_chr, range(slen))) + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] try: os.makedirs(os.path.dirname(cache_fn)) @@ -512,7 +513,8 @@ def _genslice(start, end, step): else: yield _genslice(start, i, step) - cache_res = func(map(compat_chr, range(slen))) + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = u' + '.join(gen_sig_code(cache_spec)) code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) From c3c88a2664595fd62898e44f8fc93c84e6d3c5a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:04:43 +0200 Subject: [PATCH 18/21] Allow opts.cachedir == None to disable cache --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ead1ccb1c..a3a351ee6 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,7 +82,7 @@ class YoutubeDL(object): daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. - "NONE" to disable filesystem cache. + None to disable filesystem cache. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a4769a8ae..ebf4a300f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -618,7 +618,7 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, - 'cachedir': opts.cachedir, + 'cachedir': opts.cachedir if opts.cachedir != 'NONE' else None, 'youtube_print_sig_code': opts.youtube_print_sig_code, }) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 049da2f91..a6eefdf4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -438,7 +438,7 @@ def _extract_signature_function(self, video_id, player_url, slen): cache_dir = self._downloader.params.get('cachedir', u'~/.youtube-dl/cache') - cache_enabled = cache_dir != u'NONE' + cache_enabled = cache_dir is not None if cache_enabled: cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', From e35e4ddc9a4605a63a06c5bb12055bfceacb50b8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:18:03 +0200 Subject: [PATCH 19/21] Fix output of --youtube-print-sig-code when counting down to 0 --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a6eefdf4e..148b20160 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -488,8 +488,8 @@ def _print_sig_code(self, func, slen): def gen_sig_code(idxs): def _genslice(start, end, step): starts = u'' if start == 0 else str(start) - ends = u':%d' % (end+step) - steps = u'' if step == 1 else (':%d' % step) + ends = (u':%d' % (end+step)) if end + step >= 0 else u':' + steps = u'' if step == 1 else (u':%d' % step) return u's[%s%s%s]' % (starts, ends, steps) step = None From f2c327fd39d10115573d709f94f20721a80895fb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:20:42 +0200 Subject: [PATCH 20/21] Fix 86 signature (#1494) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 148b20160..e883a2c54 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1092,7 +1092,7 @@ def _static_decrypt_signature(self, s, video_id, player_url, age_gate): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] elif len(s) == 85: return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: From 7f747732547fedc876bcdcc77ba53a56324d7e87 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 24 Sep 2013 21:26:10 +0200 Subject: [PATCH 21/21] Add option --no-cache-dir --- youtube_dl/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ebf4a300f..46d0fbd64 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -167,7 +167,12 @@ def _hide_login_info(opts): help='Output descriptions of all supported extractors', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') - general.add_option('--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', help='Location in the filesystem where youtube-dl can store downloaded information permanently. NONE to disable filesystem caching, %default by default') + general.add_option( + '--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', + help='Location in the filesystem where youtube-dl can store downloaded information permanently. %default by default') + general.add_option( + '--no-cache-dir', action='store_const', const=None, dest='cachedir', + help='Disable filesystem caching') selection.add_option('--playlist-start', @@ -560,7 +565,7 @@ def _real_main(argv=None): parser.error(u'Cannot download a video and extract audio into the same' u' file! Use "%%(ext)s" instead of %r' % determine_ext(outtmpl, u'')) - + raise ValueError(repr(opts.cachedir)) # YoutubeDL ydl = YoutubeDL({ 'usenetrc': opts.usenetrc, @@ -618,7 +623,7 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, - 'cachedir': opts.cachedir if opts.cachedir != 'NONE' else None, + 'cachedir': opts.cachedir, 'youtube_print_sig_code': opts.youtube_print_sig_code, })