def _download_chat(self, vod_id):
    """Download the complete chat replay of a VOD, one page at a time.

    Pages are requested from the v5 ``comments`` endpoint; each response's
    ``_next`` value is sent back as the ``cursor`` query parameter until no
    further cursor is returned.

    A page whose download does not yield a JSON object is retried up to
    ``extractor_retries`` times (3 when the option is unset).  The retry
    budget applies per page, so an earlier hiccup does not reduce the
    retries available to later pages.  When the budget is exhausted the
    messages collected so far are kept rather than discarded.

    Returns the subtitles dict built by ``_extract_chat``, or ``None`` when
    no chat messages were collected.  The caller assigns the result to
    ``info['subtitles']``.
    """
    request_url = f'https://api.twitch.tv/v5/videos/{vod_id}/comments'
    query_params = {'client_id': self._CLIENT_ID}

    self.to_screen('Downloading chat fragment JSONs')

    # The option may be missing or explicitly None; comparing an int with
    # None would raise TypeError, so fall back to a sane default.
    max_retries = self.get_param('extractor_retries', 3)
    if max_retries is None:
        max_retries = 3

    live_chat = []
    failed_attempts = 0
    pagenum = 1
    while True:
        response_json = self._download_json(
            request_url,
            vod_id,
            fatal=False,
            note=f'Downloading chat fragment JSON page {pagenum}',
            errnote='Live chat fragment download failed.',
            query=query_params)

        # With fatal=False a failed download does not raise.  Anything that
        # is not a JSON object (False, None, ...) is treated as a failure
        # instead of crashing on `.get` below.
        if not isinstance(response_json, dict):
            failed_attempts += 1
            if failed_attempts > max_retries:
                # Deliberately keep the partial history gathered so far.
                self.report_warning('Chat history download failed: retry limit reached')
                break
            self.report_warning(
                'Unable to fetch next chat history fragment. '
                f'Retry {failed_attempts} of {max_retries}')
            continue

        # This page succeeded, so the next page gets a fresh retry budget.
        failed_attempts = 0

        live_chat.extend(response_json.get('comments') or [])

        next_fragment_cursor = response_json.get('_next')
        # No cursor means the last page was reached.  An empty cursor is
        # treated the same way so the same page is never requested forever.
        if next_fragment_cursor is None or next_fragment_cursor == '':
            break

        query_params['cursor'] = str(next_fragment_cursor)
        pagenum += 1

    self.to_screen(f'Extracted {len(live_chat)} chat messages')
    if not live_chat:
        return None

    return self._extract_chat(live_chat, request_url)


def _extract_chat(self, chat_history, request_url):
    """Wrap a downloaded chat history as a ``live_chat`` subtitles entry.

    The returned dict has the single key ``'live_chat'`` mapping to two JSON
    subformats: one that references ``request_url`` and one that embeds
    ``chat_history`` serialized with ``json.dumps``.
    """
    url_subformat = {
        'url': request_url,
        'ext': 'json',
    }
    data_subformat = {
        'data': json.dumps(chat_history),
        'ext': 'json',
    }
    return {'live_chat': [url_subformat, data_subformat]}


# Call site shown in the same patch (inside `_real_extract`, which is only
# partially visible here and therefore not reproduced).  It replaces the
# former URL-only 'rechat' subtitles entry:
#
#     if ('live_chat' in self.get_param('subtitleslangs', [])) \
#             and info.get('timestamp') is not None:
#         info['subtitles'] = self._download_chat(vod_id)