Fix an issue when downloading non-DASH videos from bilibili.com

Also fix some test parameters
This commit is contained in:
GD-Slime 2023-11-09 16:24:52 +08:00
parent 10025b715e
commit 66db69f511

View file

@ -43,45 +43,72 @@
class BilibiliBaseIE(InfoExtractor):
_FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
def extract_formats(self, play_info, is_dash=True):
    """Build yt-dlp format dicts from a bilibili playurl response.

    DASH responses carry separate audio and video streams under the
    'dash' key; non-DASH ("durl") responses carry pre-muxed, segmented
    streams, which _real_extract stores as play_info['videos'] keyed by
    quality number (qn).
    """
    # quality id -> human-readable description, used for names/missing report
    format_names = {
        r['quality']: traverse_obj(r, 'new_description', 'display_desc')
        for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
    }

    if not is_dash:
        formats = []
        # play_info['videos'] maps qn -> list of durl segments (see _real_extract);
        # iterate items() once instead of re-looking-up the dict per key
        for qn, video in (traverse_obj(play_info, 'videos') or {}).items():
            segments = []
            file_total_size = 0
            for segment in video:
                segments.append({
                    'url': traverse_obj(segment, 'url'),
                    'duration': float_or_none(traverse_obj(segment, 'length'), scale=1000),
                    'filesize': int_or_none(traverse_obj(segment, 'size')),
                })
                # guard against a missing 'size' so the running sum never raises
                file_total_size += traverse_obj(segment, 'size') or 0
            formats.append({
                'fragments': segments,
                'url': traverse_obj(video[0], 'url'),
                'format_id': str(qn),
                'duration': float_or_none(play_info.get('timelength'), scale=1000),
                'resolution': format_names.get(qn),
                'filesize': int_or_none(file_total_size),
            })
        # non-dash responses list every available quality in 'accept_quality'
        missing_formats = format_names.keys() - set(traverse_obj(play_info, 'accept_quality'))
    else:
        audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict}))
        flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
        if flac_audio:
            audios.append(flac_audio)
        formats = [{
            'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
            'acodec': traverse_obj(audio, ('codecs', {str.lower})),
            'vcodec': 'none',
            'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
            'filesize': int_or_none(audio.get('size')),
            'format_id': str_or_none(audio.get('id')),
        } for audio in audios]
        formats.extend({
            'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
            'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
            'width': int_or_none(video.get('width')),
            'height': int_or_none(video.get('height')),
            'vcodec': video.get('codecs'),
            'acodec': 'none' if audios else None,
            'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))),
            'tbr': float_or_none(video.get('bandwidth'), scale=1000),
            'filesize': int_or_none(video.get('size')),
            'quality': int_or_none(video.get('id')),
            'format_id': traverse_obj(
                video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1),
                ('id', {str_or_none}), get_all=False),
            'format': format_names.get(video.get('id')),
        } for video in traverse_obj(play_info, ('dash', 'video', ...)))
        missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))

    if missing_formats:
        self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
                       f'you have to login or become premium member to download them. {self._login_hint()}')
@ -157,6 +184,46 @@ def _get_episodes_from_season(self, ss_id, url):
lambda _, v: url_or_none(v['share_url']) and v['id'])):
yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
def _enc_wbi(self, params: dict, video_id=None):
    """Sign *params* with bilibili's WBI scheme.

    Adds a ``wts`` timestamp and a ``w_rid`` MD5 signature derived from
    the nav endpoint's img/sub key URLs, and returns the signed dict.
    """
    if video_id is None:
        video_id = 0
    session_data = self._download_json(
        'https://api.bilibili.com/x/web-interface/nav', video_id,
        note='wbi signature...', fatal=False)

    key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0]
    # fatal=False means session_data may be None on a failed request; fall
    # back to the known static keys so img_key + sub_key never raises
    img_key = traverse_obj(
        session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
    sub_key = traverse_obj(
        session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'

    # fixed permutation used to derive the 32-char mixin key
    mixin_key_enc_tab = [
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
        33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
        61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
        36, 20, 34, 44, 52
    ]
    mixin_key = functools.reduce(lambda s, i: s + (img_key + sub_key)[i], mixin_key_enc_tab, '')[:32]

    params['wts'] = round(time.time())
    # keys must be sorted and values stripped of "!'()*" before hashing
    params = dict(sorted(params.items()))
    params = {
        k: ''.join(filter(lambda char: char not in "!'()*", str(v)))
        for k, v in params.items()
    }
    query = urllib.parse.urlencode(params)
    params['w_rid'] = hashlib.md5((query + mixin_key).encode()).hexdigest()
    return params
def _get_play_url(self, bvid: str, cid: str, headers, qn: int = None, is_dash: bool = True):
    """Fetch the WBI-signed playurl JSON for *bvid*/*cid* and return its 'data'."""
    params = {'bvid': bvid, 'cid': cid, 'fnval': 4048}
    if not is_dash:
        # non-dash requests must pin a single quality number
        params['qn'] = qn
    note_id = bvid if qn is None else f'{bvid} qn={qn}'
    mode = 'dash' if is_dash else 'non-dash'
    response = self._download_json(
        'https://api.bilibili.com/x/player/wbi/playurl', bvid,
        query=self._enc_wbi(params, note_id),
        note=f'Extracting {mode} video formats', headers=headers)
    return response['data']
class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
@ -212,7 +279,7 @@ class BiliBiliIE(BilibiliBaseIE):
'id': 'BV1bK411W797_p1',
'ext': 'mp4',
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:11',
'tags': 'count:10',
'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'uploader': '打牌还是打桩',
@ -232,7 +299,7 @@ class BiliBiliIE(BilibiliBaseIE):
'id': 'BV1bK411W797_p1',
'ext': 'mp4',
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:11',
'tags': 'count:10',
'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'uploader': '打牌还是打桩',
@ -262,7 +329,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'subtitles': 'count:2'
'subtitles': 'count:1'
},
'params': {'listsubtitles': True},
}, {
@ -343,12 +410,34 @@ class BiliBiliIE(BilibiliBaseIE):
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'params': {'skip_download': True},
}, {
'note': 'non-dash video',
'url': 'https://www.bilibili.com/video/BV1ms411Q7vw/?p=4',
'info_dict': {
'id': 'BV1ms411Q7vw_p4',
'ext': 'mp4' or 'flv',
'duration': 6838.493,
'view_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'timestamp': 1458222815,
'tags': list,
'description': '云南方言快乐生产线出品',
'comment_count': int,
'uploader_id': '3916081',
'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛',
'like_count': int,
'uploader': '一笑颠天',
'upload_date': '20160317',
},
'params': {'skip_download': True}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
headers = {'Referer': url, **self.geo_verification_headers()}
play_info = {}
is_festival = 'videoData' not in initial_state
if is_festival:
@ -380,15 +469,11 @@ def _real_extract(self, url):
aid = video_data.get('aid')
old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')
cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')
festival_info = {}
if is_festival:
play_info = self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note='Extracting festival video formats')['data']
play_info = self._get_play_url(video_id, cid, headers)
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
@ -397,6 +482,13 @@ def _real_extract(self, url):
'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
}, get_all=False)
is_dash = 'dash' in play_info
if not is_dash:
play_urls = {}
for qn in play_info['accept_quality']:
play_urls[qn] = self._get_play_url(video_id, cid, headers, qn, is_dash=False)['durl']
play_info['videos'] = play_urls
return {
**traverse_obj(initial_state, {
'uploader': ('upData', 'name'),
@ -413,7 +505,7 @@ def _real_extract(self, url):
'comment_count': ('stat', 'reply', {int_or_none}),
}, get_all=False),
'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
'formats': self.extract_formats(play_info),
'formats': self.extract_formats(play_info) if is_dash else self.extract_formats(play_info, is_dash=False),
'_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
'title': title,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
@ -546,7 +638,7 @@ def _real_extract(self, url):
return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id)
class BilibiliSpaceBaseIE(InfoExtractor):
class BilibiliSpaceBaseIE(BilibiliBaseIE):
def _extract_playlist(self, fetch_page, get_metadata, get_entries):
first_page = fetch_page(0)
metadata = get_metadata(first_page)
@ -574,37 +666,14 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
'playlist_mincount': 92,
}]
def _extract_signature(self, playlist_id):
    # Derive the 32-char WBI "mixin key" used to sign space-API queries.
    # NOTE(review): superseded by BilibiliBaseIE._enc_wbi elsewhere in this
    # change — same key derivation, different call shape.
    session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False)

    # keys are the basenames (sans extension) of the img/sub URLs
    key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0]
    # fall back to known static keys when the nav request fails
    img_key = traverse_obj(
        session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100'
    sub_key = traverse_obj(
        session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6'

    session_key = img_key + sub_key

    signature_values = []
    # fixed permutation of the 64-char session key; out-of-range positions
    # are silently skipped via try_call
    for position in (
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39,
        12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63,
        57, 62, 11, 36, 20, 34, 44, 52
    ):
        char_at_position = try_call(lambda: session_key[position])
        if char_at_position:
            signature_values.append(char_at_position)

    # first 32 chars form the mixin key
    return ''.join(signature_values)[:32]
def _real_extract(self, url):
headers = {'Referer': url, **self.geo_verification_headers()}
playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video')
if not is_video_url:
self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
'To download audios, add a "/audio" to the URL')
signature = self._extract_signature(playlist_id)
def fetch_page(page_idx):
query = {
'keyword': '',
@ -615,14 +684,13 @@ def fetch_page(page_idx):
'pn': page_idx + 1,
'ps': 30,
'tid': 0,
'web_location': 1550101,
'wts': int(time.time()),
'web_location': 1550101
}
query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest()
try:
response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search',
playlist_id, note=f'Downloading page {page_idx}', query=query)
playlist_id, note=f'Downloading page {page_idx}',
query=self._enc_wbi(query, playlist_id), headers=headers)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 412:
raise ExtractorError(