[extractor/biliIntl] Add comment extraction (#6079)

Authored by: HobbyistDev
2024-11-17 05:43:21 +00:00 · 2023-04-13 03:21:57 +09:00 · 2023-04-13 03:21:57 +09:00 · b093c38cc9
parent 2d97d154fe
commit b093c38cc9
1 changed files with 109 additions and 2 deletions
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@ -26,6 +26,7 @@
    srt_subtitles_timecode,
    str_or_none,
    traverse_obj,
    unified_timestamp,
    unsmuggle_url,
    url_or_none,
    urlencode_postdata,
@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE):
            'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
            'upload_date': '20221212',
            'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
        },
    }, {
        # episode comment extraction
        'url': 'https://www.bilibili.tv/en/play/34580/340317',
        'info_dict': {
            'id': '340317',
            'ext': 'mp4',
            'timestamp': 1604057820,
            'upload_date': '20201030',
            'episode_number': 5,
            'title': 'E5 - My Own Steel',
            'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
            'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
            'episode': 'Episode 5',
            'comment_count': int,
            'chapters': [{
                'start_time': 0,
                'end_time': 61.0,
                'title': '<Untitled Chapter 1>'
            }, {
                'start_time': 61.0,
                'end_time': 134.0,
                'title': 'Intro'
            }, {
                'start_time': 1290.0,
                'end_time': 1379.0,
                'title': 'Outro'
            }],
        },
        'params': {
            'getcomments': True
        }
    }, {
        # user generated content comment extraction
        'url': 'https://www.bilibili.tv/en/video/2045730385',
        'info_dict': {
            'id': '2045730385',
            'ext': 'mp4',
            'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
            'timestamp': 1667891924,
            'upload_date': '20221108',
            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
            'comment_count': int,
            'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
        },
        'params': {
            'getcomments': True
        }
    }, {
        # episode id without intro and outro
@ -1055,11 +1103,69 @@ def _extract_video_metadata(self, url, video_id, season_id):
        # XXX: webpage metadata may not be accurate; it is only used to avoid crashing when video_data is not found
        return merge_dicts(
-            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), {
+            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
                'title': self._html_search_meta('og:title', webpage),
                'description': self._html_search_meta('og:description', webpage)
            })
    def _get_comments_reply(self, root_id, next_id=0, display_id=None):
        """Yield every reply posted under the root comment ``root_id``.

        Pages of the ``reply/web/detail`` endpoint are requested one after
        another, each request passing the cursor value returned by the
        previous one as the ``next`` query parameter.

        @param root_id      ``rpid`` of the top-level comment whose replies are fetched
        @param next_id      cursor value sent with the first request
        @param display_id   id passed to ``_download_json`` for the download messages
        @returns            generator of comment dicts (``parent`` is taken from the API)
        """
        # Paginate with a loop rather than recursion so that a long reply
        # thread does not build a chain of nested generators.
        while True:
            comment_api_raw_data = self._download_json(
                'https://api.bilibili.tv/reply/web/detail', display_id,
                note=f'Downloading reply comment of {root_id} - {next_id}',
                query={
                    'platform': 'web',
                    'ps': 20,  # comment's reply per page (default: 3)
                    'root': root_id,
                    'next': next_id,
                })

            for reply in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
                yield {
                    'author': traverse_obj(reply, ('member', 'name')),
                    'author_id': traverse_obj(reply, ('member', 'mid')),
                    'author_thumbnail': traverse_obj(reply, ('member', 'face')),
                    'text': traverse_obj(reply, ('content', 'message')),
                    'id': reply.get('rpid'),
                    'like_count': int_or_none(reply.get('like_count')),
                    'parent': reply.get('parent'),
                    'timestamp': unified_timestamp(reply.get('ctime_text'))
                }

            if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
                break

            # Look the cursor up defensively: a response without
            # data/cursor must end the pagination instead of raising.
            upcoming_id = traverse_obj(comment_api_raw_data, ('data', 'cursor', 'next'))
            if upcoming_id is None or upcoming_id == next_id:
                # No cursor, or a cursor that did not advance: requesting
                # again would only repeat the same page forever.
                break
            next_id = upcoming_id
    def _get_comments(self, video_id, ep_id):
        """Yield the top-level comments of a video, each followed by its replies.

        Pages of the ``reply/web/root`` endpoint are requested with an
        increasing page number until the API reports the end of the list
        or a page contains no comments.

        @param video_id     id sent as ``oid`` and used for the download messages
        @param ep_id        episode id; when truthy the comment type for series
                            content is requested, otherwise the one for user
                            generated content
        @returns            generator of comment dicts
        """
        for i in itertools.count(0):
            comment_api_raw_data = self._download_json(
                'https://api.bilibili.tv/reply/web/root', video_id,
                note=f'Downloading comment page {i + 1}',
                query={
                    'platform': 'web',
                    'pn': i,  # page number
                    'ps': 20,  # comment per page (default: 20)
                    'oid': video_id,
                    'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
                    'sort_type': 1,  # 1: best, 2: recent
                })

            page_comments = traverse_obj(comment_api_raw_data, ('data', 'replies', ...))
            for comment in page_comments:
                yield {
                    'author': traverse_obj(comment, ('member', 'name')),
                    'author_id': traverse_obj(comment, ('member', 'mid')),
                    'author_thumbnail': traverse_obj(comment, ('member', 'face')),
                    'text': traverse_obj(comment, ('content', 'message')),
                    'id': comment.get('rpid'),
                    'like_count': int_or_none(comment.get('like_count')),
                    'timestamp': unified_timestamp(comment.get('ctime_text')),
                    'author_is_uploader': bool(traverse_obj(comment, ('member', 'type'))),
                }
                # 'count' is truthy when the comment has replies of its own
                if comment.get('count'):
                    yield from self._get_comments_reply(comment.get('rpid'), display_id=video_id)

            # Stop on an empty page as well as on the explicit end marker:
            # a response without data/cursor never sets 'is_end', which
            # would otherwise keep this loop requesting pages forever.
            if not page_comments or traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
                break
    def _real_extract(self, url):
        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
        video_id = ep_id or aid
@ -1087,7 +1193,8 @@ def _real_extract(self, url):
            **self._extract_video_metadata(url, video_id, season_id),
            'formats': self._get_formats(ep_id=ep_id, aid=aid),
            'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
-            'chapters': chapters
+            'chapters': chapters,
            '__post_extractor': self.extract_comments(video_id, ep_id)
        }