add live chat extraction to separate branch

2024-11-13 20:03:17 +00:00 · 2021-11-04 23:53:30 +01:00 · 2021-11-04 23:53:30 +01:00 · 1c97dfc45d
parent cc0619f62d
commit 1c97dfc45d
1 changed files with 70 additions and 10 deletions
--- a/yt_dlp/extractor/twitch.py
+++ b/yt_dlp/extractor/twitch.py
@ -525,6 +525,73 @@ def _extract_storyboard(self, item_id, storyboard_json_url, duration):
                } for path in images],
            }

+    def _download_chat(self, vod_id):
+        """Download the chat replay of a VOD page by page.
+
+        Requests the v5 ``comments`` endpoint repeatedly, feeding the
+        ``_next`` value of each response back as the ``cursor`` query
+        parameter, until a response carries no cursor. A page that cannot be
+        fetched is retried; once the retry limit is reached the download
+        stops and the messages collected so far are kept.
+
+        @param vod_id  ID of the VOD whose chat is fetched
+        @returns       The dict built by _extract_chat, or None if no chat
+                       message was collected
+        """
+        live_chat = []
+
+        request_url = f'https://api.twitch.tv/v5/videos/{vod_id}/comments'
+        query_params = {
+            'client_id': self._CLIENT_ID,
+        }
+
+        self.to_screen('Downloading chat fragment JSONs')
+
+        # TODO: question: is it OK to use this config value for this purpose?
+        max_retries = self.get_param('extractor_retries')
+        if max_retries is None:
+            # The parameter may be unset; without a number the comparison
+            # in the retry branch below would raise TypeError
+            max_retries = 3
+
+        retries = 0
+        pagenum = 1
+        while True:
+            response_json = self._download_json(
+                request_url,
+                vod_id,
+                fatal=False,
+                note='Downloading chat fragment JSON page %d' % pagenum,
+                errnote='Live chat fragment download failed.',
+                query=query_params)
+
+            # Only a JSON object can be read with .get() below; everything
+            # else (including the False of a failed non-fatal download) is
+            # treated as a failed page
+            if not isinstance(response_json, dict):
+                if retries >= max_retries:
+                    self.report_warning('Chat history download failed: retry limit reached')
+                    # TODO: when this happens, should I forget a partial chat history, or is it better to keep it too?
+                    #       I think if I keep it, it might be better to persist a warning that it is incomplete
+                    # For now the partial history is kept
+                    break
+
+                retries += 1
+                self.report_warning(
+                    f'Unable to fetch next chat history fragment. Retrying ({retries} of {max_retries})')
+                continue
+
+            # This page succeeded, so the next page starts with a full retry budget
+            retries = 0
+
+            live_chat.extend(response_json.get('comments') or [])
+            next_fragment_cursor = str_or_none(response_json.get('_next'))
+
+            # Stop when there is no usable cursor; an empty one cannot address a next page
+            if not next_fragment_cursor:
+                break
+
+            query_params['cursor'] = next_fragment_cursor
+            pagenum += 1
+
+        self.to_screen('Extracted %d chat messages' % len(live_chat))
+        if not live_chat:
+            return None
+
+        return self._extract_chat(live_chat, request_url)
+
+    def _extract_chat(self, chat_history, request_url):
+        """Wrap downloaded chat messages as a subtitles dict.
+
+        @param chat_history  JSON-serializable list of chat messages
+        @param request_url   URL of the chat endpoint, as passed by
+                             _download_chat (no query parameters)
+        @returns             Dict with a single 'live_chat' entry holding two
+                             JSON variants: one by URL, one as inline data
+        """
+        return {
+            'live_chat': [  # subtitle tag
+                {           # JSON subformat as URL
+                    # Attach the client ID, which _download_chat also sends
+                    # in its query for the same endpoint
+                    'url': update_url_query(request_url, {
+                        'client_id': self._CLIENT_ID,
+                    }),
+                    'ext': 'json',
+                },
+                {           # JSON subformat as data
+                    'data': json.dumps(chat_history),
+                    'ext': 'json',
+                },
+            ],
+        }
+
    def _real_extract(self, url):
        vod_id = self._match_id(url)

@ -556,16 +623,9 @@ def _real_extract(self, url):
        if 't' in query:
            info['start_time'] = parse_duration(query['t'][0])

-        if info.get('timestamp') is not None:
-            info['subtitles'] = {
-                'rechat': [{
-                    'url': update_url_query(
-                        'https://api.twitch.tv/v5/videos/%s/comments' % vod_id, {
-                            'client_id': self._CLIENT_ID,
-                        }),
-                    'ext': 'json',
-                }],
-            }
+        if ('live_chat' in self.get_param('subtitleslangs', [])) \
+                and info.get('timestamp') is not None:
+            info['subtitles'] = self._download_chat(vod_id)

        return info