diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 1e2070f8c..0bdc98321 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -114,6 +114,54 @@ import ctypes +class ArchiveTree(object): + """Binary search tree for download archive entries""" + def __init__(self, line): + self.left = None + self.right = None + self.line = line + + # Tree insertion + def at_insert(self, line): + cur = self + while True: + if cur.line: + if line < cur.line: + if cur.left is None: + cur.left = ArchiveTree(line) + return + else: + cur = cur.left + continue + elif line > cur.line: + if cur.right is None: + cur.right = ArchiveTree(line) + return + else: + cur = cur.right + continue + else: + # Duplicate line found + return + else: + cur.line = line + return + + def at_exist(self, line): + if self.line is None: + return False + if line < self.line: + if self.left is None: + return False + return self.left.at_exist(line) + elif line > self.line: + if self.right is None: + return False + return self.right.at_exist(line) + else: + return True + + class YoutubeDL(object): """YoutubeDL class. @@ -359,6 +407,39 @@ def __init__(self, params=None, auto_init=True): } self.params.update(params) self.cache = Cache(self) + self.archive = ArchiveTree(None) + + """Preload the archive, if any is specified""" + def preload_download_archive(self): + lines = [] + fn = self.params.get('download_archive') + if fn is None: + return False + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + lines.append(line.strip()) + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + lmax = len(lines) + if lmax > 10: + pos = 0 + while pos < lmax: + if lmax - pos <= 2: + break + target = random.randrange(pos + 1, lmax - 1) + # Swap line at pos with randomly chosen target + temp = lines[pos] + lines[pos] = lines[target] + lines[target] = temp + pos += 1 + elif lmax < 1: + # No lines were loaded + return False + for x in lines: + self.archive.at_insert(x) + return True def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: @@ -367,6 +448,11 @@ def check_deprecated(param, option, suggestion): return True return False + if self.params.get('verbose'): + self.to_stdout('[debug] Loading archive file %r' % self.params.get('download_archive')) + + preload_download_archive(self) + if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): if self.params.get('geo_verification_proxy') is None: self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] @@ -722,7 +808,7 @@ def prepare_filename(self, info_dict): return None def _match_entry(self, info_dict, incomplete): - """ Returns None iff the file should be downloaded """ + """ Returns None if the file should be downloaded """ video_title = info_dict.get('title', info_dict.get('id', 'video')) if 'title' in info_dict: @@ -2142,15 +2228,7 @@ def in_download_archive(self, info_dict): if not vid_id: return False # Incomplete video information - try: - with locked_file(fn, 'r', encoding='utf-8') as archive_file: - for line in archive_file: - if line.strip() == vid_id: - return True - except IOError as ioe: - if ioe.errno != errno.ENOENT: - raise - return False + return self.archive.at_exist(vid_id) def record_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2160,6 +2238,7 @@ def record_download_archive(self, info_dict): assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + '\n') + self.archive.at_insert(vid_id) @staticmethod def format_resolution(format, default='unknown'):