[bilibili] Add anthology support

Closes: #118
Co-authored-by: animelover1984

commit adc74b3c6d
parent beb4b92a66
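
For context on the new behaviour, a minimal, hypothetical usage sketch (not part of the commit): it assumes a yt-dlp build that already contains this change, uses the public YoutubeDL API, and reuses the URL from the new test case; `noplaylist` is the option the added check reads via `self._downloader.params.get('noplaylist')`. Results depend on the live site.

# Illustrative only: exercises the anthology check added in this commit.
from yt_dlp import YoutubeDL

anthology_url = 'https://www.bilibili.com/video/BV1bK411W797'

# Default behaviour: the bare BV id resolves to a multi-part anthology,
# so a playlist-style result with one entry per part is returned.
with YoutubeDL() as ydl:
    info = ydl.extract_info(anthology_url, download=False)
    entries = list(info.get('entries') or [])
    print(info.get('_type'), len(entries))

# With noplaylist the check is skipped and only the single video is extracted.
with YoutubeDL({'noplaylist': True}) as ydl:
    info = ydl.extract_info(anthology_url, download=False)
    print(info.get('id'))
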
@@ -7,6 +7,7 @@
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
+    compat_str,
     compat_parse_qs,
     compat_urlparse,
 )
@@ -15,6 +16,7 @@
     int_or_none,
     float_or_none,
     parse_iso8601,
+    try_get,
     smuggle_url,
     str_or_none,
     strip_jsonp,
@@ -113,6 +115,13 @@ class BiliBiliIE(InfoExtractor):
         # new BV video id format
         'url': 'https://www.bilibili.com/video/BV1JE411F741',
         'only_matching': True,
+    }, {
+        # Anthology
+        'url': 'https://www.bilibili.com/video/BV1bK411W797',
+        'info_dict': {
+            'id': 'BV1bK411W797',
+        },
+        'playlist_count': 17,
     }]

     _APP_KEY = 'iVGUTjsxvpLeuDCf'
@@ -139,9 +148,19 @@ def _real_extract(self, url):
         page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)

+        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
+        # If the video has no page argument, check to see if it's an anthology
+        if page_id is None:
+            if not self._downloader.params.get('noplaylist'):
+                r = self._extract_anthology_entries(bv_id, video_id, webpage)
+                if r is not None:
+                    self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id)
+                    return r
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
         if 'anime/' not in url:
             cid = self._search_regex(
-                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
+                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + compat_str(page_id), webpage, 'cid',
                 default=None
             ) or self._search_regex(
                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
@@ -224,7 +243,18 @@ def _real_extract(self, url):
         title = self._html_search_regex(
             (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
              r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title') + ('_p' + str(page_id) if page_id is not None else '')
+            group='title')
+
+        # Get part title for anthologies
+        if page_id is not None:
+            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
+            part_title = try_get(
+                self._download_json(
+                    "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+                    video_id, note='Extracting videos in anthology'),
+                lambda x: x['data'][int(page_id) - 1]['part'])
+            title = part_title or title
+
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -234,7 +264,7 @@ def _real_extract(self, url):

         # TODO 'view_count' requires deobfuscating Javascript
         info = {
-            'id': str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
+            'id': compat_str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
             'cid': cid,
             'title': title,
             'description': description,
@@ -300,7 +330,7 @@ def get_comments():

         global_info = {
             '_type': 'multi_video',
-            'id': video_id,
+            'id': compat_str(video_id),
             'bv_id': bv_id,
             'title': title,
             'description': description,
@@ -312,6 +342,20 @@ def get_comments():

         return global_info

+    def _extract_anthology_entries(self, bv_id, video_id, webpage):
+        title = self._html_search_regex(
+            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
+        json_data = self._download_json(
+            "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id,
+            video_id, note='Extracting videos in anthology')
+
+        if len(json_data['data']) > 1:
+            return self.playlist_from_matches(
+                json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(),
+                getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page']))
+
     def _get_video_id_set(self, id, is_bv):
         query = {'bvid': id} if is_bv else {'aid': id}
         response = self._download_json(
@@ -506,7 +550,7 @@ def _get_n_results(self, query, n):

             videos = data['result']
             for video in videos:
-                e = self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
+                e = self.url_result(video['arcurl'], 'BiliBili', compat_str(video['aid']))
                 entries.append(e)

             if(len(entries) >= n or len(videos) >= BiliBiliSearchIE.MAX_NUMBER_OF_RESULTS):
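
The new _extract_anthology_entries above keys off Bilibili's pagelist endpoint. The following standalone sketch (illustrative only; it uses the standard library instead of the extractor's _download_json and assumes the data[].page / data[].part fields the diff indexes into) performs the same lookup and prints the ?p=N URLs the getter lambda builds:

# Illustrative sketch of the pagelist lookup used above; not extractor code.
import json
from urllib.request import urlopen

bv_id = 'BV1bK411W797'
url = 'https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp' % bv_id
pages = json.load(urlopen(url)).get('data') or []

if len(pages) > 1:
    # Same URL construction as the getter lambda in _extract_anthology_entries.
    for entry in pages:
        print('https://www.bilibili.com/video/%s?p=%d  # %s'
              % (bv_id, entry['page'], entry['part']))
else:
    print('%s is a single video, not an anthology' % bv_id)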