mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-07 08:31:17 +00:00
[youtube] Extract comments' approximate timestamp (#221)
Authored by: colethedj
This commit is contained in:
parent
9e62f283ff
commit
d92f5d5a90
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import calendar
|
||||||
import hashlib
|
import hashlib
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
|
@ -27,6 +28,7 @@
|
||||||
bool_or_none,
|
bool_or_none,
|
||||||
clean_html,
|
clean_html,
|
||||||
dict_get,
|
dict_get,
|
||||||
|
datetime_from_str,
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
format_field,
|
format_field,
|
||||||
float_or_none,
|
float_or_none,
|
||||||
|
@ -46,7 +48,7 @@
|
||||||
update_url_query,
|
update_url_query,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
urljoin,
|
urljoin
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1499,6 +1501,16 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
|
||||||
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
|
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
|
||||||
regex), webpage, name, default='{}'), video_id, fatal=False)
|
regex), webpage, name, default='{}'), video_id, fatal=False)
|
||||||
|
|
||||||
|
@staticmethod
def parse_time_text(time_text):
    """
    Parse the comment time text into an approximate point in time.

    time_text is in the format 'X units ago (edited)'; only the first two
    space-separated tokens (the amount and the unit) are used.

    Returns the value produced by datetime_from_str('now-<X><units>',
    precision='auto'), or None when time_text is empty/None or has fewer
    than three space-separated tokens. Callers must handle a None result.
    """
    # The time text may be missing entirely (e.g. no 'publishedTimeText'
    # runs), in which case there is nothing to parse.
    if not time_text:
        return None
    time_text_split = time_text.split(' ')
    # Anything shorter than '<amount> <unit> ago' is not in the expected format.
    if len(time_text_split) < 3:
        return None
    return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _join_text_entries(runs):
|
def _join_text_entries(runs):
|
||||||
text = None
|
text = None
|
||||||
|
@ -1521,7 +1533,7 @@ def _extract_comment(self, comment_renderer, parent=None):
|
||||||
text = self._join_text_entries(comment_text_runs) or ''
|
text = self._join_text_entries(comment_text_runs) or ''
|
||||||
comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
|
comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
|
||||||
time_text = self._join_text_entries(comment_time_text)
|
time_text = self._join_text_entries(comment_time_text)
|
||||||
|
timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
|
||||||
author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
|
author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
|
||||||
author_id = try_get(comment_renderer,
|
author_id = try_get(comment_renderer,
|
||||||
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
|
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
|
||||||
|
@ -1532,11 +1544,10 @@ def _extract_comment(self, comment_renderer, parent=None):
|
||||||
|
|
||||||
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
|
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
|
||||||
is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
|
is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': comment_id,
|
'id': comment_id,
|
||||||
'text': text,
|
'text': text,
|
||||||
# TODO: This should be parsed to timestamp
|
'timestamp': timestamp,
|
||||||
'time_text': time_text,
|
'time_text': time_text,
|
||||||
'like_count': votes,
|
'like_count': votes,
|
||||||
'is_favorited': is_liked,
|
'is_favorited': is_liked,
|
||||||
|
@ -1624,12 +1635,12 @@ def extract_thread(parent_renderer):
|
||||||
comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
|
comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
|
||||||
if page_num == 0:
|
if page_num == 0:
|
||||||
if first_continuation:
|
if first_continuation:
|
||||||
note_prefix = "Downloading initial comment continuation page"
|
note_prefix = 'Downloading initial comment continuation page'
|
||||||
else:
|
else:
|
||||||
note_prefix = " Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
|
note_prefix = ' Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
|
||||||
else:
|
else:
|
||||||
note_prefix = "%sDownloading comment%s page %d %s" % (
|
note_prefix = '%sDownloading comment%s page %d %s' % (
|
||||||
" " if parent else "",
|
' ' if parent else '',
|
||||||
' replies' if parent else '',
|
' replies' if parent else '',
|
||||||
page_num,
|
page_num,
|
||||||
comment_prog_str)
|
comment_prog_str)
|
||||||
|
@ -1644,13 +1655,13 @@ def extract_thread(parent_renderer):
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
|
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
|
||||||
if e.cause.code == 413:
|
if e.cause.code == 413:
|
||||||
self.report_warning("Assumed end of comments (received HTTP Error 413)")
|
self.report_warning('Assumed end of comments (received HTTP Error 413)')
|
||||||
return
|
return
|
||||||
# Downloading page may result in intermittent 5xx HTTP error
|
# Downloading page may result in intermittent 5xx HTTP error
|
||||||
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
|
# Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
|
||||||
last_error = 'HTTP Error %s' % e.cause.code
|
last_error = 'HTTP Error %s' % e.cause.code
|
||||||
if e.cause.code == 404:
|
if e.cause.code == 404:
|
||||||
last_error = last_error + " (this API is probably deprecated)"
|
last_error = last_error + ' (this API is probably deprecated)'
|
||||||
if count < retries:
|
if count < retries:
|
||||||
continue
|
continue
|
||||||
raise
|
raise
|
||||||
|
@ -1668,7 +1679,7 @@ def extract_thread(parent_renderer):
|
||||||
|
|
||||||
# YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
|
# YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
|
||||||
if browse.get('reload'):
|
if browse.get('reload'):
|
||||||
raise ExtractorError("Invalid or missing params in continuation request", expected=False)
|
raise ExtractorError('Invalid or missing params in continuation request', expected=False)
|
||||||
|
|
||||||
# TODO: not tested, merged from old extractor
|
# TODO: not tested, merged from old extractor
|
||||||
err_msg = browse.get('externalErrorMessage')
|
err_msg = browse.get('externalErrorMessage')
|
||||||
|
@ -1708,7 +1719,7 @@ def extract_thread(parent_renderer):
|
||||||
|
|
||||||
if expected_comment_count:
|
if expected_comment_count:
|
||||||
comment_counts[1] = str_to_int(expected_comment_count)
|
comment_counts[1] = str_to_int(expected_comment_count)
|
||||||
self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count))
|
self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
|
||||||
yield comment_counts[1]
|
yield comment_counts[1]
|
||||||
|
|
||||||
# TODO: cli arg.
|
# TODO: cli arg.
|
||||||
|
@ -1724,7 +1735,7 @@ def extract_thread(parent_renderer):
|
||||||
continuation = YoutubeTabIE._build_continuation_query(
|
continuation = YoutubeTabIE._build_continuation_query(
|
||||||
continuation=sort_continuation_renderer.get('continuation'),
|
continuation=sort_continuation_renderer.get('continuation'),
|
||||||
ctp=sort_continuation_renderer.get('clickTrackingParams'))
|
ctp=sort_continuation_renderer.get('clickTrackingParams'))
|
||||||
self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
|
self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
|
||||||
break
|
break
|
||||||
|
|
||||||
for entry in known_continuation_renderers[key](continuation_renderer):
|
for entry in known_continuation_renderers[key](continuation_renderer):
|
||||||
|
@ -1757,7 +1768,7 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
|
||||||
continue
|
continue
|
||||||
comments.append(comment)
|
comments.append(comment)
|
||||||
break
|
break
|
||||||
self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total))
|
self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
|
||||||
return {
|
return {
|
||||||
'comments': comments,
|
'comments': comments,
|
||||||
'comment_count': len(comments),
|
'comment_count': len(comments),
|
||||||
|
@ -2979,7 +2990,7 @@ def extract_entries(parent_renderer): # this needs to called again for continua
|
||||||
self.report_warning('%s. Retrying ...' % last_error)
|
self.report_warning('%s. Retrying ...' % last_error)
|
||||||
try:
|
try:
|
||||||
response = self._call_api(
|
response = self._call_api(
|
||||||
ep="browse", fatal=True, headers=headers,
|
ep='browse', fatal=True, headers=headers,
|
||||||
video_id='%s page %s' % (item_id, page_num),
|
video_id='%s page %s' % (item_id, page_num),
|
||||||
query={
|
query={
|
||||||
'continuation': continuation['continuation'],
|
'continuation': continuation['continuation'],
|
||||||
|
|
Loading…
Reference in a new issue