From 7adae468018a60a2f3ccb4cd1c851a7002217c19 Mon Sep 17 00:00:00 2001
From: mpeter50 <83356418+mpeter50@users.noreply.github.com>
Date: Wed, 28 Dec 2022 18:50:42 +0100
Subject: [PATCH] fix twitch vod chat download

chat download now uses the GraphQL API, instead of the old one that doesn't work anymore
---
 yt_dlp/extractor/twitch.py | 87 ++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 40 deletions(-)
diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py
index e66199200..d6f16f953 100644
--- a/yt_dlp/extractor/twitch.py
+++ b/yt_dlp/extractor/twitch.py
@@ -3,6 +3,7 @@
 import json
 import random
 import re
+import time
 
 from .common import InfoExtractor
 from ..compat import (
@@ -55,6 +56,7 @@ class TwitchBaseIE(InfoExtractor):
         'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad',
         'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41',
         'VideoPlayer_VODSeekbarPreviewVideo': '07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6',
+        'VideoCommentsByOffsetOrCursor': 'b70a3591ff0f4e0313d126c6a1502d79a1c02baebb288227c582044aa76adf6a',
     }
 
     @property
@@ -526,67 +528,72 @@ def _extract_storyboard(self, item_id, storyboard_json_url, duration):
                 } for path in images],
             }
 
-    def _download_chat(self, vod_id):
-        live_chat = list()
-
-        request_url = f'https://api.twitch.tv/v5/videos/{vod_id}/comments'
-        query_params = {
-            'client_id': self._CLIENT_ID
-        }
-
-        self.to_screen('Downloading chat fragment JSONs')
-
-        # TODO: question: is it OK to use this config value for this purpose?
-        max_retries = self.get_param('extractor_retries')
+    def _extract_chat(self, vod_id):
+        chat_history = []
+        has_more_pages = True
+        retry_sleep = 5
+        max_retries = 3
         retries = 0
         pagenum = 1
-        while True:
-            response_json = self._download_json(
-                request_url,
-                vod_id,
-                fatal=False,
-                note='Downloading chat fragment JSON page %d' % pagenum,
-                errnote='Live chat fragment download failed.',
-                query=query_params)
+        gql_ops = [
+            {
+                'operationName': 'VideoCommentsByOffsetOrCursor',
+                'variables': {
+                    'videoID': vod_id,
+                    # 'cursor': <filled in in subsequent requests>
+                }
+            }
+        ]
 
-            if response_json is False:
-                self.report_warning(f'Unable to fetch next chat history fragment. {retries}. try of {max_retries}')
+        self.to_screen('Downloading chat fragment pages')
+
+        while has_more_pages:
+            response = self._download_gql(vod_id, gql_ops, 'Downloading chat fragment page %d' % pagenum, fatal=False)
+
+            if response is False:
+                self.report_warning(f'Unable to fetch next chat history fragment. {retries + 1}. try of {max_retries}')
 
                 if retries < max_retries:
                     retries += 1
+                    time.sleep(retry_sleep)
                     continue
                 else:
                     self.report_warning('Chat history download failed: retry limit reached')
-                    # TODO: when this happens, should I forget a partial chat history, or is it better to keep it too?
+                    # TODO: when this happens, should I forget a partial chat history, or is it better to keep it?
                     #       I think if I keep it, it might be better to persist a warning that it is incomplete
-                    # live_chat.clear()
+                    # chat_history.clear()
                     break
 
-            live_chat.extend(response_json.get('comments') or [])
-            next_fragment_cursor = str_or_none(response_json.get('_next'))
+            comments_obj = traverse_obj(response, (0, 'data', 'video', 'comments'))
+            chat_history.extend(traverse_obj(comments_obj, ('edges', slice, 'node')))
 
-            if next_fragment_cursor is None:
-                break
+            has_more_pages = traverse_obj(comments_obj, ('pageInfo', 'hasNextPage'))
 
-            query_params['cursor'] = next_fragment_cursor
-            pagenum += 1
+            if has_more_pages:
+                cursor = traverse_obj(comments_obj, ('edges', 0, 'cursor'))
+                if cursor is None:
+                    self.report_warning("Cannot continue downloading chat history: cursor is missing. There are additional chat pages to download.")
+                    break
 
-        chat_history_length = len(live_chat)
+                pagenum += 1
+                gql_ops[0]['variables']['cursor'] = cursor
 
+            if has_more_pages is None:
+                cursor = traverse_obj(comments_obj, ('edges', 0, 'cursor'))
+
+                if cursor is not None:
+                    self.report_warning("Next page indication is missing, but found cursor. Continuing chat history download.")
+                else:  # In this case maintenance might be needed. Purpose is to prevent silent errors.
+                    self.report_warning("Next page indication is missing, and cursor not found.")
+
+        chat_history_length = len(chat_history)
         self.to_screen('Extracted %d chat messages' % chat_history_length)
         if chat_history_length == 0:
             return None
 
-        return self._extract_chat(live_chat, request_url)
-
-    def _extract_chat(self, chat_history, request_url):
         return {
             'live_chat': [  # subtitle tag
-                {           # JSON subformat as URL
-                    'url': request_url,
-                    'ext': 'json'
-                },
-                {           # JSON subformat as data
+                {
                     'data': json.dumps(chat_history),
                     'ext': 'json'
                 }
@@ -626,7 +633,7 @@ def _real_extract(self, url):
 
         if ('live_chat' in self.get_param('subtitleslangs', [])) \
                 and info.get('timestamp') is not None:
-            info['subtitles'] = self._download_chat(vod_id)
+            info['subtitles'] = self._extract_chat(vod_id)
 
         return info