From d92f5d5a9005a7a6df7bc081f64d662c70a3f3cf Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Wed, 7 Apr 2021 11:37:43 +0000 Subject: [PATCH] [youtube] Extract comments' approximate timestamp (#221) Authored by: colethedj --- yt_dlp/extractor/youtube.py | 41 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 016750a70f..c3d06b9678 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals +import calendar import hashlib import itertools import json @@ -27,6 +28,7 @@ bool_or_none, clean_html, dict_get, + datetime_from_str, ExtractorError, format_field, float_or_none, @@ -46,7 +48,7 @@ update_url_query, url_or_none, urlencode_postdata, - urljoin, + urljoin ) @@ -1499,6 +1501,16 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name): (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), regex), webpage, name, default='{}'), video_id, fatal=False) + @staticmethod + def parse_time_text(time_text): + """ + Parse the comment time text + time_text is in the format 'X units ago (edited)' + """ + time_text_split = time_text.split(' ') + if len(time_text_split) >= 3: + return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') + @staticmethod def _join_text_entries(runs): text = None @@ -1521,7 +1533,7 @@ def _extract_comment(self, comment_renderer, parent=None): text = self._join_text_entries(comment_text_runs) or '' comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or [] time_text = self._join_text_entries(comment_time_text) - + timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple()) author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str) author_id = try_get(comment_renderer, lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) @@ -1532,11 +1544,10 @@ def _extract_comment(self, comment_renderer, parent=None): author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool) - return { 'id': comment_id, 'text': text, - # TODO: This should be parsed to timestamp + 'timestamp': timestamp, 'time_text': time_text, 'like_count': votes, 'is_favorited': is_liked, @@ -1624,12 +1635,12 @@ def extract_thread(parent_renderer): comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1]) if page_num == 0: if first_continuation: - note_prefix = "Downloading initial comment continuation page" + note_prefix = 'Downloading initial comment continuation page' else: - note_prefix = " Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str) + note_prefix = ' Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str) else: - note_prefix = "%sDownloading comment%s page %d %s" % ( - " " if parent else "", + note_prefix = '%sDownloading comment%s page %d %s' % ( + ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) @@ -1644,13 +1655,13 @@ def extract_thread(parent_renderer): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413): if e.cause.code == 413: - self.report_warning("Assumed end of comments (received HTTP Error 413)") + self.report_warning('Assumed end of comments (received HTTP Error 413)') return # Downloading page may result in intermittent 5xx HTTP error # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 last_error = 'HTTP Error %s' % e.cause.code if e.cause.code == 404: - last_error = last_error + " (this API is probably deprecated)" + last_error = last_error + ' (this API is probably deprecated)' if count < retries: continue raise @@ -1668,7 +1679,7 @@ def extract_thread(parent_renderer): # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth) if browse.get('reload'): - raise ExtractorError("Invalid or missing params in continuation request", expected=False) + raise ExtractorError('Invalid or missing params in continuation request', expected=False) # TODO: not tested, merged from old extractor err_msg = browse.get('externalErrorMessage') @@ -1708,7 +1719,7 @@ def extract_thread(parent_renderer): if expected_comment_count: comment_counts[1] = str_to_int(expected_comment_count) - self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count)) + self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count)) yield comment_counts[1] # TODO: cli arg. @@ -1724,7 +1735,7 @@ def extract_thread(parent_renderer): continuation = YoutubeTabIE._build_continuation_query( continuation=sort_continuation_renderer.get('continuation'), ctp=sort_continuation_renderer.get('clickTrackingParams')) - self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest')) + self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest')) break for entry in known_continuation_renderers[key](continuation_renderer): @@ -1757,7 +1768,7 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token): continue comments.append(comment) break - self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total)) + self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total)) return { 'comments': comments, 'comment_count': len(comments), @@ -2979,7 +2990,7 @@ def extract_entries(parent_renderer): # this needs to called again for continua self.report_warning('%s. Retrying ...' % last_error) try: response = self._call_api( - ep="browse", fatal=True, headers=headers, + ep='browse', fatal=True, headers=headers, video_id='%s page %s' % (item_id, page_num), query={ 'continuation': continuation['continuation'],