From 21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Thu, 21 Sep 2023 13:34:35 -0400
Subject: [PATCH] [ie/douyutv] Fix extractors (#7652)

Closes #2494, Closes #7295
Authored by: c-basalt
---
 yt_dlp/extractor/douyutv.py | 273 ++++++++++++++++++++++++------------
 1 file changed, 184 insertions(+), 89 deletions(-)
diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py
index fa40844df..ee8893d5a 100644
--- a/yt_dlp/extractor/douyutv.py
+++ b/yt_dlp/extractor/douyutv.py
@@ -1,31 +1,72 @@
 import time
 import hashlib
-import re
 import urllib
+import uuid
 
 from .common import InfoExtractor
+from .openload import PhantomJSwrapper
 from ..utils import (
     ExtractorError,
+    UserNotLive,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_resolution,
+    str_or_none,
+    traverse_obj,
     unescapeHTML,
-    unified_strdate,
+    url_or_none,
+    urlencode_postdata,
     urljoin,
 )
 
 
-class DouyuTVIE(InfoExtractor):
-    IE_DESC = '斗鱼'
+class DouyuBaseIE(InfoExtractor):
+    def _download_cryptojs_md5(self, video_id):
+        for url in [
+            'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
+            'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
+        ]:
+            js_code = self._download_webpage(
+                url, video_id, note='Downloading signing dependency', fatal=False)
+            if js_code:
+                self.cache.store('douyu', 'crypto-js-md5', js_code)
+                return js_code
+        raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
+
+    def _get_cryptojs_md5(self, video_id):
+        return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)
+
+    def _calc_sign(self, sign_func, video_id, a):
+        b = uuid.uuid4().hex
+        c = round(time.time())
+        js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
+        phantom = PhantomJSwrapper(self)
+        result = phantom.execute(js_script, video_id,
+                                 note='Executing JS signing script').strip()
+        return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
+
+    def _search_js_sign_func(self, webpage, fatal=True):
+        # The greedy look-behind ensures last possible script tag is matched
+        return self._search_regex(
+            r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal)
+
+
+class DouyuTVIE(DouyuBaseIE):
+    IE_DESC = '斗鱼直播'
     _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
     _TESTS = [{
-        'url': 'http://www.douyutv.com/iseven',
+        'url': 'https://www.douyu.com/pigff',
         'info_dict': {
-            'id': '17732',
-            'display_id': 'iseven',
-            'ext': 'flv',
-            'title': 're:^清晨醒脑！根本停不下来！ [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': r're:.*m7show@163\.com.*',
-            'thumbnail': r're:^https?://.*\.png',
-            'uploader': '7师傅',
+            'id': '24422',
+            'display_id': 'pigff',
+            'ext': 'mp4',
+            'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
+            'thumbnail': str,
+            'uploader': 'pigff',
             'is_live': True,
+            'live_status': 'is_live',
         },
         'params': {
             'skip_download': True,
@@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _get_sign_func(self, room_id, video_id):
+        return self._download_json(
+            f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
+            note='Getting signing script')['data'][f'room{room_id}']
+
+    def _extract_stream_formats(self, stream_formats):
+        formats = []
+        for stream_info in traverse_obj(stream_formats, (..., 'data')):
+            stream_url = urljoin(
+                traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
+            if stream_url:
+                rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
+                rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
+                ext = determine_ext(stream_url)
+                formats.append({
+                    'url': stream_url,
+                    'format_id': str_or_none(rate_id),
+                    'ext': 'mp4' if ext == 'm3u8' else ext,
+                    'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
+                    'quality': rate_id % -10000 if rate_id is not None else None,
+                    **traverse_obj(rate_info, {
+                        'format': ('name', {str_or_none}),
+                        'tbr': ('bit', {int_or_none}),
+                    }),
+                })
+        return formats
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        if video_id.isdigit():
-            room_id = video_id
-        else:
-            page = self._download_webpage(url, video_id)
-            room_id = self._html_search_regex(
-                r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
+        webpage = self._download_webpage(url, video_id)
+        room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')
+
+        if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
+            raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
+        if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
+            raise UserNotLive(video_id=video_id)
 
         # Grab metadata from API
         params = {
@@ -102,110 +171,136 @@ def _real_extract(self, url):
             'time': int(time.time()),
         }
         params['auth'] = hashlib.md5(
-            f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
-        room = self._download_json(
+            f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
+        room = traverse_obj(self._download_json(
             f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
-            note='Downloading room info', query=params)['data']
+            note='Downloading room info', query=params, fatal=False), 'data')
 
         # 1 = live, 2 = offline
-        if room.get('show_status') == '2':
-            raise ExtractorError('Live stream is offline', expected=True)
+        if traverse_obj(room, 'show_status') == '2':
+            raise UserNotLive(video_id=video_id)
 
-        video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
-        formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
+        js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
+        form_data = {
+            'rate': 0,
+            **self._calc_sign(js_sign_func, video_id, room_id),
+        }
+        stream_formats = [self._download_json(
+            f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
+            video_id, note="Downloading livestream format",
+            data=urlencode_postdata(form_data))]
 
-        title = unescapeHTML(room['room_name'])
-        description = room.get('show_details')
-        thumbnail = room.get('room_src')
-        uploader = room.get('nickname')
+        for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
+            if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
+                form_data['rate'] = rate_id
+                stream_formats.append(self._download_json(
+                    f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
+                    video_id, note=f'Downloading livestream format {rate_id}',
+                    data=urlencode_postdata(form_data)))
 
         return {
             'id': room_id,
-            'display_id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
+            'formats': self._extract_stream_formats(stream_formats),
             'is_live': True,
-            'subtitles': subs,
-            'formats': formats,
+            **traverse_obj(room, {
+                'display_id': ('url', {str}, {lambda i: i[1:]}),
+                'title': ('room_name', {unescapeHTML}),
+                'description': ('show_details', {str}),
+                'uploader': ('nickname', {str}),
+                'thumbnail': ('room_src', {url_or_none}),
+            })
         }
 
 
-class DouyuShowIE(InfoExtractor):
+class DouyuShowIE(DouyuBaseIE):
     _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
 
     _TESTS = [{
-        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
-        'md5': '0c2cfd068ee2afe657801269b2d86214',
+        'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
         'info_dict': {
-            'id': 'rjNBdvnVXNzvE2yw',
+            'id': 'mPyq7oVNe5Yv1gLY',
             'ext': 'mp4',
-            'title': '陈一发儿：砒霜 我有个室友系列！04-01 22点场',
-            'duration': 7150.08,
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'uploader': '陈一发儿',
-            'uploader_id': 'XrZwYelr5wbK',
-            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
-            'upload_date': '20170402',
+            'title': '四川人小时候的味道“蒜苗回锅肉”，传统菜不能丢，要常做来吃',
+            'duration': 633,
+            'thumbnail': str,
+            'uploader': '美食作家王刚V',
+            'uploader_id': 'OVAO4NVx1m7Q',
+            'timestamp': 1661850002,
+            'upload_date': '20220830',
+            'view_count': int,
+            'tags': ['美食', '美食综合'],
         },
     }, {
         'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
         'only_matching': True,
     }]
 
+    _FORMATS = {
+        'super': '原画',
+        'high': '超清',
+        'normal': '高清',
+    }
+
+    _QUALITIES = {
+        'super': -1,
+        'high': -2,
+        'normal': -3,
+    }
+
+    _RESOLUTIONS = {
+        'super': '1920x1080',
+        'high': '1280x720',
+        'normal': '852x480',
+    }
+
     def _real_extract(self, url):
         url = url.replace('vmobile.', 'v.')
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
-        room_info = self._parse_json(self._search_regex(
-            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+        video_info = self._search_json(
+            r'<script>\s*window\.\$DATA\s*=', webpage,
+            'video info', video_id, transform_source=js_to_json)
 
-        video_info = None
+        js_sign_func = self._search_js_sign_func(webpage)
+        form_data = {
+            'vid': video_id,
+            **self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
+        }
+        url_info = self._download_json(
+            'https://v.douyu.com/api/stream/getStreamUrl', video_id,
+            data=urlencode_postdata(form_data), note="Downloading video formats")
 
-        for trial in range(5):
-            # Sometimes Douyu rejects our request. Let's try it more times
-            try:
-                video_info = self._download_json(
-                    'https://vmobile.douyu.com/video/getInfo', video_id,
-                    query={'vid': video_id},
-                    headers={
-                        'Referer': url,
-                        'x-requested-with': 'XMLHttpRequest',
-                    })
-                break
-            except ExtractorError:
-                self._sleep(1, video_id)
-
-        if not video_info:
-            raise ExtractorError('Can\'t fetch video info')
-
-        formats = self._extract_m3u8_formats(
-            video_info['data']['video_url'], video_id,
-            entry_protocol='m3u8_native', ext='mp4')
-
-        upload_date = unified_strdate(self._html_search_regex(
-            r'<em>上传时间：</em><span>([^<]+)</span>', webpage,
-            'upload date', fatal=False))
-
-        uploader = uploader_id = uploader_url = None
-        mobj = re.search(
-            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
-            webpage)
-        if mobj:
-            uploader_id, uploader = mobj.groups()
-            uploader_url = urljoin(url, '/author/' + uploader_id)
+        formats = []
+        for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
+            video_url = traverse_obj(url, ('url', {url_or_none}))
+            if video_url:
+                ext = determine_ext(video_url)
+                formats.append({
+                    'format': self._FORMATS.get(name),
+                    'format_id': name,
+                    'url': video_url,
+                    'quality': self._QUALITIES.get(name),
+                    'ext': 'mp4' if ext == 'm3u8' else ext,
+                    'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
+                    **parse_resolution(self._RESOLUTIONS.get(name))
+                })
+            else:
+                self.to_screen(
+                    f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')
 
         return {
             'id': video_id,
-            'title': room_info['name'],
             'formats': formats,
-            'duration': room_info.get('duration'),
-            'thumbnail': room_info.get('pic'),
-            'upload_date': upload_date,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'uploader_url': uploader_url,
+            **traverse_obj(video_info, ('DATA', {
+                'title': ('content', 'title', {str}),
+                'uploader': ('content', 'author', {str}),
+                'uploader_id': ('content', 'up_id', {str_or_none}),
+                'duration': ('content', 'video_duration', {int_or_none}),
+                'thumbnail': ('content', 'video_pic', {url_or_none}),
+                'timestamp': ('content', 'create_time', {int_or_none}),
+                'view_count': ('content', 'view_num', {int_or_none}),
+                'tags': ('videoTag', ..., 'tagName', {str}),
+            }))
         }