From c35448b7b14113b35c4415dbfbf488c4731f006f Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Thu, 1 Jun 2023 20:43:32 +1200 Subject: [PATCH] [extractor/youtube] Extract more metadata for comments (#7179) Adds new comment fields: * `author_url` - The url to the comment author's page * `author_is_verified` - Whether the author is verified on the platform * `is_pinned` - Whether the comment is pinned to the top of the comments Closes https://github.com/yt-dlp/yt-dlp/issues/5411 Authored by: coletdjnz --- yt_dlp/extractor/common.py | 9 +++-- yt_dlp/extractor/youtube.py | 68 ++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 306911a6c..fa46a5240 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -314,6 +314,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -325,8 +330,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. (It will be set diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index ae4b58205..ccf97705a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3271,37 +3271,50 @@ def _extract_comment(self, comment_renderer, parent=None): if not comment_id: return - text = self._get_text(comment_renderer, 'contentText') + info = { + 'id': comment_id, + 'text': self._get_text(comment_renderer, 'contentText'), + 'like_count': self._get_count(comment_renderer, 'voteCount'), + 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})), + 'author': self._get_text(comment_renderer, 'authorText'), + 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})), + 'parent': parent or 'root', + } # Timestamp is an estimate calculated from the current time and time_text time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' timestamp = self._parse_time_text(time_text) - author = self._get_text(comment_renderer, 'authorText') - author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) - - votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), str)) or 0 - author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str) - - author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) - is_favorited = 'creatorHeart' in (try_get( - comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {}) - return { - 'id': comment_id, - 'text': text, + info.update({ + # FIXME: non-standard, but we need a way of showing that it is an estimate. + '_time_text': time_text, 'timestamp': timestamp, - 'time_text': time_text, - 'like_count': votes, - 'is_favorited': is_favorited, - 'author': author, - 'author_id': author_id, - 'author_thumbnail': author_thumbnail, - 'author_is_uploader': author_is_uploader, - 'parent': parent or 'root' - } + }) + + info['author_url'] = urljoin( + 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', ( + ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))), + expected_type=str, get_all=False)) + + author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner') + if author_is_uploader is not None: + info['author_is_uploader'] = author_is_uploader + + comment_abr = traverse_obj( + comment_renderer, ('actionsButtons', 'commentActionButtonsRenderer'), expected_type=dict) + if comment_abr is not None: + info['is_favorited'] = 'creatorHeart' in comment_abr + + comment_ab_icontype = traverse_obj( + comment_renderer, ('authorCommentBadge', 'authorCommentBadgeRenderer', 'icon', 'iconType')) + if comment_ab_icontype is not None: + info['author_is_verified'] = comment_ab_icontype in ('CHECK_CIRCLE_THICK', 'OFFICIAL_ARTIST_BADGE') + + is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') + if is_pinned: + info['is_pinned'] = True + + return info def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None): @@ -3349,14 +3362,13 @@ def extract_thread(contents): comment = self._extract_comment(comment_renderer, parent) if not comment: continue - is_pinned = bool(traverse_obj(comment_renderer, 'pinnedCommentBadge')) comment_id = comment['id'] - if is_pinned: + if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. # See: https://github.com/yt-dlp/yt-dlp/issues/6290 if comment_id in tracker['seen_comment_ids']: - if comment_id in tracker['pinned_comment_ids'] and not is_pinned: + if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'): # Pinned comments may appear a second time in newest first sort # See: https://github.com/yt-dlp/yt-dlp/issues/6712 continue