mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-08 01:13:15 +00:00
[extractor/youtube] Extract more metadata for comments (#7179)
Adds new comment fields:
* `author_url` - The URL to the comment author's page
* `author_is_verified` - Whether the author is verified on the platform
* `is_pinned` - Whether the comment is pinned to the top of the comments

Closes https://github.com/yt-dlp/yt-dlp/issues/5411

Authored by: coletdjnz
This commit is contained in:
parent
1c16d9df53
commit
c35448b7b1
|
@ -314,6 +314,11 @@ class InfoExtractor:
|
||||||
* "author" - human-readable name of the comment author
|
* "author" - human-readable name of the comment author
|
||||||
* "author_id" - user ID of the comment author
|
* "author_id" - user ID of the comment author
|
||||||
* "author_thumbnail" - The thumbnail of the comment author
|
* "author_thumbnail" - The thumbnail of the comment author
|
||||||
|
* "author_url" - The url to the comment author's page
|
||||||
|
* "author_is_verified" - Whether the author is verified
|
||||||
|
on the platform
|
||||||
|
* "author_is_uploader" - Whether the comment is made by
|
||||||
|
the video uploader
|
||||||
* "id" - Comment ID
|
* "id" - Comment ID
|
||||||
* "html" - Comment as HTML
|
* "html" - Comment as HTML
|
||||||
* "text" - Plain text of the comment
|
* "text" - Plain text of the comment
|
||||||
|
@ -325,8 +330,8 @@ class InfoExtractor:
|
||||||
* "dislike_count" - Number of negative ratings of the comment
|
* "dislike_count" - Number of negative ratings of the comment
|
||||||
* "is_favorited" - Whether the comment is marked as
|
* "is_favorited" - Whether the comment is marked as
|
||||||
favorite by the video uploader
|
favorite by the video uploader
|
||||||
* "author_is_uploader" - Whether the comment is made by
|
* "is_pinned" - Whether the comment is pinned to
|
||||||
the video uploader
|
the top of the comments
|
||||||
age_limit: Age restriction for the video, as an integer (years)
|
age_limit: Age restriction for the video, as an integer (years)
|
||||||
webpage_url: The URL to the video webpage, if given to yt-dlp it
|
webpage_url: The URL to the video webpage, if given to yt-dlp it
|
||||||
should allow to get the same result again. (It will be set
|
should allow to get the same result again. (It will be set
|
||||||
|
|
|
@ -3271,37 +3271,50 @@ def _extract_comment(self, comment_renderer, parent=None):
|
||||||
if not comment_id:
|
if not comment_id:
|
||||||
return
|
return
|
||||||
|
|
||||||
text = self._get_text(comment_renderer, 'contentText')
|
info = {
|
||||||
|
'id': comment_id,
|
||||||
|
'text': self._get_text(comment_renderer, 'contentText'),
|
||||||
|
'like_count': self._get_count(comment_renderer, 'voteCount'),
|
||||||
|
'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})),
|
||||||
|
'author': self._get_text(comment_renderer, 'authorText'),
|
||||||
|
'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})),
|
||||||
|
'parent': parent or 'root',
|
||||||
|
}
|
||||||
|
|
||||||
# Timestamp is an estimate calculated from the current time and time_text
|
# Timestamp is an estimate calculated from the current time and time_text
|
||||||
time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
|
time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
|
||||||
timestamp = self._parse_time_text(time_text)
|
timestamp = self._parse_time_text(time_text)
|
||||||
|
|
||||||
author = self._get_text(comment_renderer, 'authorText')
|
info.update({
|
||||||
author_id = try_get(comment_renderer,
|
# FIXME: non-standard, but we need a way of showing that it is an estimate.
|
||||||
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str)
|
'_time_text': time_text,
|
||||||
|
|
||||||
votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
|
|
||||||
lambda x: x['likeCount']), str)) or 0
|
|
||||||
author_thumbnail = try_get(comment_renderer,
|
|
||||||
lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str)
|
|
||||||
|
|
||||||
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
|
|
||||||
is_favorited = 'creatorHeart' in (try_get(
|
|
||||||
comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
|
|
||||||
return {
|
|
||||||
'id': comment_id,
|
|
||||||
'text': text,
|
|
||||||
'timestamp': timestamp,
|
'timestamp': timestamp,
|
||||||
'time_text': time_text,
|
})
|
||||||
'like_count': votes,
|
|
||||||
'is_favorited': is_favorited,
|
info['author_url'] = urljoin(
|
||||||
'author': author,
|
'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', (
|
||||||
'author_id': author_id,
|
('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))),
|
||||||
'author_thumbnail': author_thumbnail,
|
expected_type=str, get_all=False))
|
||||||
'author_is_uploader': author_is_uploader,
|
|
||||||
'parent': parent or 'root'
|
author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner')
|
||||||
}
|
if author_is_uploader is not None:
|
||||||
|
info['author_is_uploader'] = author_is_uploader
|
||||||
|
|
||||||
|
comment_abr = traverse_obj(
|
||||||
|
comment_renderer, ('actionsButtons', 'commentActionButtonsRenderer'), expected_type=dict)
|
||||||
|
if comment_abr is not None:
|
||||||
|
info['is_favorited'] = 'creatorHeart' in comment_abr
|
||||||
|
|
||||||
|
comment_ab_icontype = traverse_obj(
|
||||||
|
comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType'))
|
||||||
|
if comment_ab_icontype is not None:
|
||||||
|
info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE')
|
||||||
|
|
||||||
|
is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge')
|
||||||
|
if is_pinned:
|
||||||
|
info['is_pinned'] = True
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
|
def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
|
||||||
|
|
||||||
|
@ -3349,14 +3362,13 @@ def extract_thread(contents):
|
||||||
comment = self._extract_comment(comment_renderer, parent)
|
comment = self._extract_comment(comment_renderer, parent)
|
||||||
if not comment:
|
if not comment:
|
||||||
continue
|
continue
|
||||||
is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge'))
|
|
||||||
comment_id = comment['id']
|
comment_id = comment['id']
|
||||||
if is_pinned:
|
if comment.get('is_pinned'):
|
||||||
tracker['pinned_comment_ids'].add(comment_id)
|
tracker['pinned_comment_ids'].add(comment_id)
|
||||||
# Sometimes YouTube may break and give us infinite looping comments.
|
# Sometimes YouTube may break and give us infinite looping comments.
|
||||||
# See: https://github.com/yt-dlp/yt-dlp/issues/6290
|
# See: https://github.com/yt-dlp/yt-dlp/issues/6290
|
||||||
if comment_id in tracker['seen_comment_ids']:
|
if comment_id in tracker['seen_comment_ids']:
|
||||||
if comment_id in tracker['pinned_comment_ids'] and not is_pinned:
|
if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'):
|
||||||
# Pinned comments may appear a second time in newest first sort
|
# Pinned comments may appear a second time in newest first sort
|
||||||
# See: https://github.com/yt-dlp/yt-dlp/issues/6712
|
# See: https://github.com/yt-dlp/yt-dlp/issues/6712
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in a new issue