import base64
import functools
import hashlib
import itertools
import math
import time
import urllib.error
import urllib.parse

from .common import InfoExtractor, SearchInfoExtractor
from ..dependencies import Cryptodome
from ..utils import (
    ExtractorError,
    GeoRestrictedError,
    InAdvancePagedList,
    OnDemandPagedList,
    filter_dict,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    make_archive_id,
    merge_dicts,
    mimetype2ext,
    parse_count,
    parse_qs,
    qualities,
    smuggle_url,
    srt_subtitles_timecode,
    str_or_none,
    traverse_obj,
    try_call,
    unified_timestamp,
    unsmuggle_url,
    url_or_none,
    urlencode_postdata,
)


class BilibiliBaseIE(InfoExtractor):
    """Shared helpers for the Bilibili extractors: format extraction from the
    ``playurl`` API, subtitle/chapter/comment retrieval, and season episode
    listing for bangumi playlists."""

    def extract_formats(self, play_info):
        """Build the format list from a Bilibili ``playurl`` API response.

        Audio-only and video-only DASH streams are listed separately by the
        API; FLAC audio (if any) lives under a dedicated ``dash.flac`` key.
        Warns about advertised qualities missing from the manifest, which
        typically require login or a premium membership.
        """
        # Map quality id -> human-readable name, used for the 'format' field
        # and the "missing formats" warning below.
        format_names = {
            r['quality']: traverse_obj(r, 'new_description', 'display_desc')
            for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
        }

        # FIX: traverse_obj yields None when the 'dash.audio' branch is absent;
        # fall back to an empty list so appending FLAC (or iterating) cannot
        # raise AttributeError/TypeError for audio-less or FLAC-only responses.
        audios = traverse_obj(play_info, ('dash', 'audio', ...)) or []
        flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
        if flac_audio:
            audios.append(flac_audio)

        formats = [{
            'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
            'acodec': audio.get('codecs'),
            'vcodec': 'none',
            'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
            'filesize': int_or_none(audio.get('size')),
        } for audio in audios]

        formats.extend({
            'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
            'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
            'width': int_or_none(video.get('width')),
            'height': int_or_none(video.get('height')),
            'vcodec': video.get('codecs'),
            # Video-only streams when separate audio exists; else codec unknown.
            'acodec': 'none' if audios else None,
            'tbr': float_or_none(video.get('bandwidth'), scale=1000),
            'filesize': int_or_none(video.get('size')),
            'quality': int_or_none(video.get('id')),
            'format': format_names.get(video.get('id')),
        } for video in traverse_obj(play_info, ('dash', 'video', ...)))

        # Qualities the API advertises but did not serve — usually gated
        # behind login/premium membership.
        missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
        if missing_formats:
            self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
                           f'you have to login or become premium member to download them. {self._login_hint()}')

        return formats

    def json2srt(self, json_data):
        """Convert Bilibili's JSON subtitle payload into SRT-formatted text."""
        srt_data = ''
        for idx, line in enumerate(json_data.get('body') or []):
            srt_data += (f'{idx + 1}\n'
                         f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n'
                         f'{line["content"]}\n\n')
        return srt_data

    def _get_subtitles(self, video_id, aid, cid):
        """Return subtitles: raw danmaku XML plus any closed captions
        advertised by the player API, converted to SRT."""
        subtitles = {
            'danmaku': [{
                'ext': 'xml',
                'url': f'https://comment.bilibili.com/{cid}.xml',
            }]
        }

        video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id)
        for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)):
            subtitles.setdefault(s['lan'], []).append({
                'ext': 'srt',
                'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
            })
        return subtitles

    def _get_chapters(self, aid, cid):
        """Map the player API's 'view_points' to yt-dlp chapters, or None.

        Skips the request entirely when either id is falsy.
        """
        chapters = aid and cid and self._download_json(
            'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid},
            note='Extracting chapters', fatal=False)
        return traverse_obj(chapters, ('data', 'view_points', ..., {
            'title': 'content',
            'start_time': 'from',
            'end_time': 'to',
        })) or None

    def _get_comments(self, aid):
        """Yield all comments (and nested replies) for a video, paging until
        the API returns an empty reply list."""
        for idx in itertools.count(1):
            replies = traverse_obj(
                self._download_json(
                    f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
                    aid, note=f'Extracting comments from page {idx}', fatal=False),
                ('data', 'replies'))
            if not replies:
                return
            for children in map(self._get_all_children, replies):
                yield from children

    def _get_all_children(self, reply):
        """Recursively flatten a comment and its nested replies."""
        yield {
            'author': traverse_obj(reply, ('member', 'uname')),
            'author_id': traverse_obj(reply, ('member', 'mid')),
            'id': reply.get('rpid'),
            'text': traverse_obj(reply, ('content', 'message')),
            'timestamp': reply.get('ctime'),
            'parent': reply.get('parent') or 'root',
        }
        for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
            yield from children

    def _get_episodes_from_season(self, ss_id, url):
        """Yield url_result entries for every episode in a bangumi season."""
        season_info = self._download_json(
            'https://api.bilibili.com/pgc/web/season/section', ss_id,
            note='Downloading season info', query={'season_id': ss_id},
            headers={'Referer': url, **self.geo_verification_headers()})

        for entry in traverse_obj(season_info, (
                'result', 'main_section', 'episodes',
                lambda _, v: url_or_none(v['share_url']) and v['id'])):
            yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
class BiliBiliIE(BilibiliBaseIE):
    """Extractor for regular Bilibili videos (BV/av URLs and festival pages)."""

    # FIX: restored the `id` named group — the source had `(?P[^/?#&]+)`,
    # which is invalid regex syntax and leaves _match_id with no group to read.
    _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'https://www.bilibili.com/video/BV13x41117TL',
        'info_dict': {
            'id': 'BV13x41117TL',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'ext': 'mp4',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'upload_date': '20170301',
            'timestamp': 1488353834,
            'like_count': int,
            'view_count': int,
        },
    }, {
        # old av URL version
        'url': 'http://www.bilibili.com/video/av1074402/',
        'info_dict': {
            'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
            'ext': 'mp4',
            'uploader': '菊子桑',
            'uploader_id': '156160',
            'id': 'BV11x411K7CN',
            'title': '【金坷垃】金泡沫',
            'duration': 308.36,
            'upload_date': '20140420',
            'timestamp': 1397983878,
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'like_count': int,
            'comment_count': int,
            'view_count': int,
            'tags': list,
        },
        'params': {'skip_download': True},
    }, {
        'note': 'Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797',
        'info_dict': {
            'id': 'BV1bK411W797',
            'title': '物语中的人物是如何吐槽自己的OP的'
        },
        'playlist_count': 18,
        'playlist': [{
            'info_dict': {
                'id': 'BV1bK411W797_p1',
                'ext': 'mp4',
                'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
                'tags': 'count:11',
                'timestamp': 1589601697,
                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
                'uploader': '打牌还是打桩',
                'uploader_id': '150259984',
                'like_count': int,
                'comment_count': int,
                'upload_date': '20200516',
                'view_count': int,
                'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
                'duration': 90.314,
            }
        }]
    }, {
        'note': 'Specific page of Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1',
        'info_dict': {
            'id': 'BV1bK411W797_p1',
            'ext': 'mp4',
            'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
            'tags': 'count:11',
            'timestamp': 1589601697,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'uploader': '打牌还是打桩',
            'uploader_id': '150259984',
            'like_count': int,
            'comment_count': int,
            'upload_date': '20200516',
            'view_count': int,
            'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
            'duration': 90.314,
        }
    }, {
        'note': 'video has subtitles',
        'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
        'info_dict': {
            'id': 'BV12N4y1M7rh',
            'ext': 'mp4',
            'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1',
            'tags': list,
            'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
            'duration': 313.557,
            'upload_date': '20220709',
            'uploader': '小夫Tech',
            'timestamp': 1657347907,
            'uploader_id': '1326814124',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'subtitles': 'count:2'
        },
        'params': {'listsubtitles': True},
    }, {
        'url': 'https://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': 'BV13x41117TL',
            'ext': 'mp4',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'upload_date': '20170301',
            'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
            'timestamp': 1488353834,
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'view_count': int,
            'like_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'note': 'video has chapter',
        'url': 'https://www.bilibili.com/video/BV1vL411G7N7/',
        'info_dict': {
            'id': 'BV1vL411G7N7',
            'ext': 'mp4',
            'title': '如何为你的B站视频添加进度条分段',
            'timestamp': 1634554558,
            'upload_date': '20211018',
            'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d',
            'tags': list,
            'uploader': '爱喝咖啡的当麻',
            'duration': 669.482,
            'uploader_id': '1680903',
            'chapters': 'count:6',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }, {
        'note': 'video redirects to festival page',
        'url': 'https://www.bilibili.com/video/BV1wP4y1P72h',
        'info_dict': {
            'id': 'BV1wP4y1P72h',
            'ext': 'mp4',
            'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】',
            'timestamp': 1643947497,
            'upload_date': '20220204',
            'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6',
            'uploader': '叨叨冯聊音乐',
            'duration': 246.719,
            'uploader_id': '528182630',
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }, {
        'note': 'newer festival video',
        'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
        'info_dict': {
            'id': 'BV1ay4y1d77f',
            'ext': 'mp4',
            'title': '【崩坏3新春剧场】为特别的你送上祝福!',
            'timestamp': 1674273600,
            'upload_date': '20230121',
            'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8',
            'uploader': '果蝇轰',
            'duration': 1111.722,
            'uploader_id': '8469526',
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)

        # Festival pages embed the metadata under 'videoInfo' instead of
        # 'videoData', and do not embed playinfo in the page at all.
        is_festival = 'videoData' not in initial_state
        if is_festival:
            video_data = initial_state['videoInfo']
        else:
            play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
            video_data = initial_state['videoData']

        video_id, title = video_data['bvid'], video_data.get('title')

        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
        page_list_json = not is_festival and traverse_obj(
            self._download_json(
                'https://api.bilibili.com/x/player/pagelist', video_id,
                fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
                note='Extracting videos in anthology'),
            'data', expected_type=list) or []
        is_anthology = len(page_list_json) > 1

        part_id = int_or_none(parse_qs(url).get('p', [None])[-1])
        # No explicit part requested -> offer the whole anthology as a playlist.
        if is_anthology and not part_id and self._yes_playlist(video_id, video_id):
            return self.playlist_from_matches(
                page_list_json, video_id, title, ie=BiliBiliIE,
                getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}')

        if is_anthology:
            part_id = part_id or 1
            title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}'

        aid = video_data.get('aid')
        # Legacy archive id of the form '<aid>_part<n>' for backward compat.
        old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')

        cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')

        festival_info = {}
        if is_festival:
            # Festival pages require a separate playurl API call for formats.
            play_info = self._download_json(
                'https://api.bilibili.com/x/player/playurl', video_id,
                query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
                note='Extracting festival video formats')['data']

            festival_info = traverse_obj(initial_state, {
                'uploader': ('videoInfo', 'upName'),
                'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
                'like_count': ('videoStatus', 'like', {int_or_none}),
                'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
            }, get_all=False)

        return {
            **traverse_obj(initial_state, {
                'uploader': ('upData', 'name'),
                'uploader_id': ('upData', 'mid', {str_or_none}),
                'like_count': ('videoData', 'stat', 'like', {int_or_none}),
                'tags': ('tags', ..., 'tag_name'),
                'thumbnail': ('videoData', 'pic', {url_or_none}),
            }),
            # festival_info overrides the regular-page fields when present
            **festival_info,
            **traverse_obj(video_data, {
                'description': 'desc',
                'timestamp': ('pubdate', {int_or_none}),
                'view_count': (('viewCount', ('stat', 'view')), {int_or_none}),
                'comment_count': ('stat', 'reply', {int_or_none}),
            }, get_all=False),
            'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
            'formats': self.extract_formats(play_info),
            '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
            'title': title,
            'duration': float_or_none(play_info.get('timelength'), scale=1000),
            'chapters': self._get_chapters(aid, cid),
            'subtitles': self.extract_subtitles(video_id, aid, cid),
            '__post_extractor': self.extract_comments(aid),
            'http_headers': {'Referer': url},
        }
}] def _real_extract(self, url): video_id = self._match_id(url) episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') headers = {'Referer': url, **self.geo_verification_headers()} play_info = self._download_json( 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, headers=headers) premium_only = play_info.get('code') == -10403 play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} formats = self.extract_formats(play_info) if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') bangumi_info = self._download_json( 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', query={'ep_id': episode_id}, headers=headers)['result'] episode_number, episode_info = next(( (idx, ep) for idx, ep in enumerate(traverse_obj( bangumi_info, ('episodes', ..., {dict})), 1) if str_or_none(ep.get('id')) == episode_id), (1, {})) season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), None) aid = episode_info.get('aid') return { 'id': video_id, 'formats': formats, **traverse_obj(bangumi_info, { 'series': ('series', 'series_title', {str}), 'series_id': ('series', 'series_id', {str_or_none}), 'thumbnail': ('square_cover', {url_or_none}), }), 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), 'episode': episode_info.get('long_title'), 'episode_id': episode_id, 'episode_number': int_or_none(episode_info.get('title')) or episode_number, 'season_id': str_or_none(season_id), 'season_number': season_number, 
'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), '__post_extractor': self.extract_comments(aid), 'http_headers': headers, } class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { 'id': '24097891', }, 'playlist_mincount': 25, }] def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) ss_id = self._search_json( r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) class BiliBiliBangumiSeasonIE(BilibiliBaseIE): _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { 'id': '26801' }, 'playlist_mincount': 26 }] def _real_extract(self, url): ss_id = self._match_id(url) return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): def _extract_playlist(self, fetch_page, get_metadata, get_entries): first_page = fetch_page(0) metadata = get_metadata(first_page) paged_list = InAdvancePagedList( lambda idx: get_entries(fetch_page(idx) if idx else first_page), metadata['page_count'], metadata['page_size']) return metadata, paged_list class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P\d+)(?P