mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-17 05:43:21 +00:00
[extractor/biliIntl] Add comment extraction (#6079)
Authored by: HobbyistDev
This commit is contained in:
parent
2d97d154fe
commit
b093c38cc9
|
@ -26,6 +26,7 @@
|
||||||
srt_subtitles_timecode,
|
srt_subtitles_timecode,
|
||||||
str_or_none,
|
str_or_none,
|
||||||
traverse_obj,
|
traverse_obj,
|
||||||
|
unified_timestamp,
|
||||||
unsmuggle_url,
|
unsmuggle_url,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
|
@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE):
|
||||||
'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
|
'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
|
||||||
'upload_date': '20221212',
|
'upload_date': '20221212',
|
||||||
'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
|
'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
# episode comment extraction
|
||||||
|
'url': 'https://www.bilibili.tv/en/play/34580/340317',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '340317',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'timestamp': 1604057820,
|
||||||
|
'upload_date': '20201030',
|
||||||
|
'episode_number': 5,
|
||||||
|
'title': 'E5 - My Own Steel',
|
||||||
|
'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
|
||||||
|
'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
|
||||||
|
'episode': 'Episode 5',
|
||||||
|
'comment_count': int,
|
||||||
|
'chapters': [{
|
||||||
|
'start_time': 0,
|
||||||
|
'end_time': 61.0,
|
||||||
|
'title': '<Untitled Chapter 1>'
|
||||||
|
}, {
|
||||||
|
'start_time': 61.0,
|
||||||
|
'end_time': 134.0,
|
||||||
|
'title': 'Intro'
|
||||||
|
}, {
|
||||||
|
'start_time': 1290.0,
|
||||||
|
'end_time': 1379.0,
|
||||||
|
'title': 'Outro'
|
||||||
|
}],
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'getcomments': True
|
||||||
|
}
|
||||||
|
}, {
|
||||||
|
# user generated content comment extraction
|
||||||
|
'url': 'https://www.bilibili.tv/en/video/2045730385',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '2045730385',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
|
||||||
|
'timestamp': 1667891924,
|
||||||
|
'upload_date': '20221108',
|
||||||
|
'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
|
||||||
|
'comment_count': int,
|
||||||
|
'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'getcomments': True
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
# episode id without intro and outro
|
# episode id without intro and outro
|
||||||
|
@ -1055,11 +1103,69 @@ def _extract_video_metadata(self, url, video_id, season_id):
|
||||||
|
|
||||||
# XXX: webpage metadata may not be accurate; it is only used to avoid crashing when video_data is not found
|
# XXX: webpage metadata may not be accurate; it is only used to avoid crashing when video_data is not found
|
||||||
return merge_dicts(
|
return merge_dicts(
|
||||||
self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), {
|
self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
|
||||||
'title': self._html_search_meta('og:title', webpage),
|
'title': self._html_search_meta('og:title', webpage),
|
||||||
'description': self._html_search_meta('og:description', webpage)
|
'description': self._html_search_meta('og:description', webpage)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def _get_comments_reply(self, root_id, next_id=0, display_id=None):
    """Yield every reply posted under the root comment ``root_id``.

    Pages through ``https://api.bilibili.tv/reply/web/detail`` starting at
    cursor ``next_id`` and yields one comment dict per reply.

    Pagination is done with a loop rather than recursion so that long
    reply threads do not nest one generator frame per page.

    Paging stops when any of the following holds:
      * the response marks the cursor as ``is_end``;
      * the response has no usable ``data.cursor`` (e.g. an error payload),
        instead of raising ``KeyError``/``TypeError`` on the lookup;
      * the cursor does not advance, which would otherwise re-request the
        same page forever.

    @param root_id      id of the top-level comment whose replies are fetched
    @param next_id      cursor value sent as the ``next`` query parameter
    @param display_id   id passed to ``_download_json`` for the request
    """
    while True:
        comment_api_raw_data = self._download_json(
            'https://api.bilibili.tv/reply/web/detail', display_id,
            note=f'Downloading reply comment of {root_id} - {next_id}',
            query={
                'platform': 'web',
                'ps': 20,  # comment's reply per page (default: 3)
                'root': root_id,
                'next': next_id,
            })

        for reply in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
            yield {
                'author': traverse_obj(reply, ('member', 'name')),
                'author_id': traverse_obj(reply, ('member', 'mid')),
                'author_thumbnail': traverse_obj(reply, ('member', 'face')),
                'text': traverse_obj(reply, ('content', 'message')),
                'id': reply.get('rpid'),
                'like_count': int_or_none(reply.get('like_count')),
                'parent': reply.get('parent'),
                'timestamp': unified_timestamp(reply.get('ctime_text'))
            }

        # A missing or malformed cursor is treated as the end of the thread
        cursor = traverse_obj(
            comment_api_raw_data, ('data', 'cursor'), expected_type=dict) or {}
        if cursor.get('is_end'):
            return

        following_id = cursor.get('next')
        # No cursor, or a cursor that did not move: nothing more can be fetched
        if following_id is None or following_id == next_id:
            return
        next_id = following_id
|
||||||
|
|
||||||
|
def _get_comments(self, video_id, ep_id):
    """Yield the comments of a video, each followed by its replies.

    Pages through ``https://api.bilibili.tv/reply/web/root`` (20 comments
    per page, sorted by "best"). Every top-level comment is yielded first;
    if it reports a non-zero ``count``, its replies are then yielded via
    ``_get_comments_reply``.

    Paging stops when the response marks the cursor as ``is_end`` or when
    a page contains no comments at all. The second condition prevents an
    endless request loop when the response carries no ``data``/``cursor``
    (e.g. an error payload) and therefore never reports ``is_end``.

    @param video_id     id sent as ``oid``; also used as the display id
    @param ep_id        truthy for series content (``type`` 3), falsy for
                        user generated content (``type`` 1)
    """
    for page_num in itertools.count(0):
        comment_api_raw_data = self._download_json(
            'https://api.bilibili.tv/reply/web/root', video_id,
            note=f'Downloading comment page {page_num + 1}',
            query={
                'platform': 'web',
                'pn': page_num,  # page number
                'ps': 20,  # comment per page (default: 20)
                'oid': video_id,
                'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
                'sort_type': 1,  # 1: best, 2: recent
            })

        page_comments = traverse_obj(comment_api_raw_data, ('data', 'replies', ...))
        for comment in page_comments:
            yield {
                'author': traverse_obj(comment, ('member', 'name')),
                'author_id': traverse_obj(comment, ('member', 'mid')),
                'author_thumbnail': traverse_obj(comment, ('member', 'face')),
                'text': traverse_obj(comment, ('content', 'message')),
                'id': comment.get('rpid'),
                'like_count': int_or_none(comment.get('like_count')),
                'timestamp': unified_timestamp(comment.get('ctime_text')),
                'author_is_uploader': bool(traverse_obj(comment, ('member', 'type'))),
            }
            # Only hit the reply endpoint when the comment reports replies
            if comment.get('count'):
                yield from self._get_comments_reply(comment.get('rpid'), display_id=video_id)

        # An empty page means there is nothing left even without ``is_end``
        if not page_comments or traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
            break
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
|
season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
|
||||||
video_id = ep_id or aid
|
video_id = ep_id or aid
|
||||||
|
@ -1087,7 +1193,8 @@ def _real_extract(self, url):
|
||||||
**self._extract_video_metadata(url, video_id, season_id),
|
**self._extract_video_metadata(url, video_id, season_id),
|
||||||
'formats': self._get_formats(ep_id=ep_id, aid=aid),
|
'formats': self._get_formats(ep_id=ep_id, aid=aid),
|
||||||
'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
|
'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
|
||||||
'chapters': chapters
|
'chapters': chapters,
|
||||||
|
'__post_extractor': self.extract_comments(video_id, ep_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue