[extractor/youtube] Extract more metadata for comments (#7179)

Adds new comment fields:
* `author_url` - The URL to the comment author's page
* `author_is_verified` - Whether the author is verified on the platform
* `is_pinned` - Whether the comment is pinned to the top of the comments

Closes https://github.com/yt-dlp/yt-dlp/issues/5411

Authored by: coletdjnz
This commit is contained in:
coletdjnz 2023-06-01 20:43:32 +12:00 committed by GitHub
parent 1c16d9df53
commit c35448b7b1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 30 deletions

View file

@ -314,6 +314,11 @@ class InfoExtractor:
* "author" - human-readable name of the comment author * "author" - human-readable name of the comment author
* "author_id" - user ID of the comment author * "author_id" - user ID of the comment author
* "author_thumbnail" - The thumbnail of the comment author * "author_thumbnail" - The thumbnail of the comment author
* "author_url" - The URL to the comment author's page
* "author_is_verified" - Whether the author is verified
on the platform
* "author_is_uploader" - Whether the comment is made by
the video uploader
* "id" - Comment ID * "id" - Comment ID
* "html" - Comment as HTML * "html" - Comment as HTML
* "text" - Plain text of the comment * "text" - Plain text of the comment
@ -325,8 +330,8 @@ class InfoExtractor:
* "dislike_count" - Number of negative ratings of the comment * "dislike_count" - Number of negative ratings of the comment
* "is_favorited" - Whether the comment is marked as * "is_favorited" - Whether the comment is marked as
favorite by the video uploader favorite by the video uploader
* "author_is_uploader" - Whether the comment is made by * "is_pinned" - Whether the comment is pinned to
the video uploader the top of the comments
age_limit: Age restriction for the video, as an integer (years) age_limit: Age restriction for the video, as an integer (years)
webpage_url: The URL to the video webpage, if given to yt-dlp it webpage_url: The URL to the video webpage, if given to yt-dlp it
should allow to get the same result again. (It will be set should allow to get the same result again. (It will be set

View file

@ -3271,37 +3271,50 @@ def _extract_comment(self, comment_renderer, parent=None):
if not comment_id: if not comment_id:
return return
text = self._get_text(comment_renderer, 'contentText') info = {
'id': comment_id,
'text': self._get_text(comment_renderer, 'contentText'),
'like_count': self._get_count(comment_renderer, 'voteCount'),
'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})),
'author': self._get_text(comment_renderer, 'authorText'),
'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})),
'parent': parent or 'root',
}
# Timestamp is an estimate calculated from the current time and time_text # Timestamp is an estimate calculated from the current time and time_text
time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
timestamp = self._parse_time_text(time_text) timestamp = self._parse_time_text(time_text)
author = self._get_text(comment_renderer, 'authorText') info.update({
author_id = try_get(comment_renderer, # FIXME: non-standard, but we need a way of showing that it is an estimate.
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) '_time_text': time_text,
votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
lambda x: x['likeCount']), str)) or 0
author_thumbnail = try_get(comment_renderer,
lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str)
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
is_favorited = 'creatorHeart' in (try_get(
comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
return {
'id': comment_id,
'text': text,
'timestamp': timestamp, 'timestamp': timestamp,
'time_text': time_text, })
'like_count': votes,
'is_favorited': is_favorited, info['author_url'] = urljoin(
'author': author, 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', (
'author_id': author_id, ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))),
'author_thumbnail': author_thumbnail, expected_type=str, get_all=False))
'author_is_uploader': author_is_uploader,
'parent': parent or 'root' author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner')
} if author_is_uploader is not None:
info['author_is_uploader'] = author_is_uploader
comment_abr = traverse_obj(
comment_renderer, ('actionsButtons', 'commentActionButtonsRenderer'), expected_type=dict)
if comment_abr is not None:
info['is_favorited'] = 'creatorHeart' in comment_abr
comment_ab_icontype = traverse_obj(
comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType'))
if comment_ab_icontype is not None:
info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE')
is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge')
if is_pinned:
info['is_pinned'] = True
return info
def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None): def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
@ -3349,14 +3362,13 @@ def extract_thread(contents):
comment = self._extract_comment(comment_renderer, parent) comment = self._extract_comment(comment_renderer, parent)
if not comment: if not comment:
continue continue
is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge'))
comment_id = comment['id'] comment_id = comment['id']
if is_pinned: if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id) tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments. # Sometimes YouTube may break and give us infinite looping comments.
# See: https://github.com/yt-dlp/yt-dlp/issues/6290 # See: https://github.com/yt-dlp/yt-dlp/issues/6290
if comment_id in tracker['seen_comment_ids']: if comment_id in tracker['seen_comment_ids']:
if comment_id in tracker['pinned_comment_ids'] and not is_pinned: if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'):
# Pinned comments may appear a second time in newest first sort # Pinned comments may appear a second time in newest first sort
# See: https://github.com/yt-dlp/yt-dlp/issues/6712 # See: https://github.com/yt-dlp/yt-dlp/issues/6712
continue continue