[extractor/biliIntl] Add comment extraction (#6079)

Authored by: HobbyistDev
This commit is contained in:
HobbyistDev 2023-04-13 03:21:57 +09:00 committed by GitHub
parent 2d97d154fe
commit b093c38cc9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -26,6 +26,7 @@
srt_subtitles_timecode, srt_subtitles_timecode,
str_or_none, str_or_none,
traverse_obj, traverse_obj,
unified_timestamp,
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE):
'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
'upload_date': '20221212', 'upload_date': '20221212',
'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
},
}, {
# episode comment extraction
'url': 'https://www.bilibili.tv/en/play/34580/340317',
'info_dict': {
'id': '340317',
'ext': 'mp4',
'timestamp': 1604057820,
'upload_date': '20201030',
'episode_number': 5,
'title': 'E5 - My Own Steel',
'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode': 'Episode 5',
'comment_count': int,
'chapters': [{
'start_time': 0,
'end_time': 61.0,
'title': '<Untitled Chapter 1>'
}, {
'start_time': 61.0,
'end_time': 134.0,
'title': 'Intro'
}, {
'start_time': 1290.0,
'end_time': 1379.0,
'title': 'Outro'
}],
},
'params': {
'getcomments': True
}
}, {
# user generated content comment extraction
'url': 'https://www.bilibili.tv/en/video/2045730385',
'info_dict': {
'id': '2045730385',
'ext': 'mp4',
'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
'timestamp': 1667891924,
'upload_date': '20221108',
'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
'comment_count': int,
'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
},
'params': {
'getcomments': True
} }
}, { }, {
# episode id without intro and outro # episode id without intro and outro
@ -1055,11 +1103,69 @@ def _extract_video_metadata(self, url, video_id, season_id):
# XXX: webpage metadata may not accurate, it just used to not crash when video_data not found # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found
return merge_dicts( return merge_dicts(
self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
'title': self._html_search_meta('og:title', webpage), 'title': self._html_search_meta('og:title', webpage),
'description': self._html_search_meta('og:description', webpage) 'description': self._html_search_meta('og:description', webpage)
}) })
def _get_comments_reply(self, root_id, next_id=0, display_id=None):
comment_api_raw_data = self._download_json(
'https://api.bilibili.tv/reply/web/detail', display_id,
note=f'Downloading reply comment of {root_id} - {next_id}',
query={
'platform': 'web',
'ps': 20, # comment's reply per page (default: 3)
'root': root_id,
'next': next_id,
})
for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
yield {
'author': traverse_obj(replies, ('member', 'name')),
'author_id': traverse_obj(replies, ('member', 'mid')),
'author_thumbnail': traverse_obj(replies, ('member', 'face')),
'text': traverse_obj(replies, ('content', 'message')),
'id': replies.get('rpid'),
'like_count': int_or_none(replies.get('like_count')),
'parent': replies.get('parent'),
'timestamp': unified_timestamp(replies.get('ctime_text'))
}
if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
yield from self._get_comments_reply(
root_id, comment_api_raw_data['data']['cursor']['next'], display_id)
def _get_comments(self, video_id, ep_id):
for i in itertools.count(0):
comment_api_raw_data = self._download_json(
'https://api.bilibili.tv/reply/web/root', video_id,
note=f'Downloading comment page {i + 1}',
query={
'platform': 'web',
'pn': i, # page number
'ps': 20, # comment per page (default: 20)
'oid': video_id,
'type': 3 if ep_id else 1, # 1: user generated content, 3: series content
'sort_type': 1, # 1: best, 2: recent
})
for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
yield {
'author': traverse_obj(replies, ('member', 'name')),
'author_id': traverse_obj(replies, ('member', 'mid')),
'author_thumbnail': traverse_obj(replies, ('member', 'face')),
'text': traverse_obj(replies, ('content', 'message')),
'id': replies.get('rpid'),
'like_count': int_or_none(replies.get('like_count')),
'timestamp': unified_timestamp(replies.get('ctime_text')),
'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
}
if replies.get('count'):
yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)
if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
break
def _real_extract(self, url): def _real_extract(self, url):
season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
video_id = ep_id or aid video_id = ep_id or aid
@ -1087,7 +1193,8 @@ def _real_extract(self, url):
**self._extract_video_metadata(url, video_id, season_id), **self._extract_video_metadata(url, video_id, season_id),
'formats': self._get_formats(ep_id=ep_id, aid=aid), 'formats': self._get_formats(ep_id=ep_id, aid=aid),
'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
'chapters': chapters 'chapters': chapters,
'__post_extractor': self.extract_comments(video_id, ep_id)
} }