add live chat extraction to separate branch

This commit is contained in:
mpeter50 2021-11-04 23:53:30 +01:00
parent cc0619f62d
commit 1c97dfc45d

View file

@@ -525,6 +525,73 @@ def _extract_storyboard(self, item_id, storyboard_json_url, duration):
} for path in images],
}
def _download_chat(self, vod_id):
    """Download the whole chat replay of a VOD, one API page at a time.

    Requests the v5 ``comments`` endpoint repeatedly, following the
    ``_next`` cursor of each response until a response carries none, and
    collects the ``comments`` entries of every page.

    A page that cannot be fetched is retried up to ``extractor_retries``
    times; the counter restarts for every page. If a page still fails,
    paging stops and the messages gathered so far are kept.

    :param vod_id: id of the VOD; used in the request URL and passed to
        ``_download_json`` as the video id.
    :return: the subtitles dict built by ``_extract_chat``, or ``None``
        when no chat message was collected.
    """
    request_url = f'https://api.twitch.tv/v5/videos/{vod_id}/comments'
    query_params = {
        'client_id': self._CLIENT_ID,
    }
    self.to_screen('Downloading chat fragment JSONs')

    # NOTE: the generic extractor retry setting is reused for chat pages.
    max_retries = self.get_param('extractor_retries')
    if max_retries is None:
        # Option not set: comparing the counter against None would raise
        # TypeError. 3 is assumed to match the --extractor-retries
        # default - TODO confirm.
        max_retries = 3

    live_chat = []
    pagenum = 1
    failed_attempts = 0  # failures of the page currently being fetched
    while True:
        response_json = self._download_json(
            request_url,
            vod_id,
            fatal=False,
            note='Downloading chat fragment JSON page %d' % pagenum,
            errnote='Live chat fragment download failed.',
            query=query_params)

        # With fatal=False a failed download does not raise. Anything that
        # is not a JSON object (False on failure, possibly None for an
        # unparsable body) is treated as a failed attempt.
        if not isinstance(response_json, dict):
            if failed_attempts >= max_retries:
                self.report_warning('Chat history download failed: retry limit reached')
                if live_chat:
                    # Keep what was downloaded, but make the gap visible.
                    self.report_warning(
                        'Chat history is incomplete: only the first %d messages were downloaded'
                        % len(live_chat))
                break
            failed_attempts += 1
            self.report_warning(
                f'Unable to fetch chat history fragment {pagenum}. '
                f'Retrying ({failed_attempts} of {max_retries})')
            continue

        # The page arrived: the next page gets a fresh retry budget.
        failed_attempts = 0
        live_chat.extend(response_json.get('comments') or [])

        next_fragment_cursor = response_json.get('_next')
        if not next_fragment_cursor:
            # A missing (or empty) cursor marks the last page; an empty one
            # would otherwise request the same page again and again.
            break
        query_params['cursor'] = str(next_fragment_cursor)
        pagenum += 1

    self.to_screen('Extracted %d chat messages' % len(live_chat))
    if not live_chat:
        return None
    return self._extract_chat(live_chat, request_url)
def _extract_chat(self, chat_history, request_url):
    """Wrap a downloaded chat history as a ``live_chat`` subtitle entry.

    The returned dict has the single key ``live_chat`` holding two JSON
    sub-formats: one that refers to ``request_url`` and one that embeds
    ``chat_history`` serialized with ``json.dumps``.
    """
    # JSON subformat as URL
    by_url = {'url': request_url, 'ext': 'json'}
    # JSON subformat as data
    by_data = {'data': json.dumps(chat_history), 'ext': 'json'}
    # 'live_chat' is the subtitle tag
    return {'live_chat': [by_url, by_data]}
def _real_extract(self, url):
vod_id = self._match_id(url)
@@ -556,16 +623,9 @@ def _real_extract(self, url):
if 't' in query:
info['start_time'] = parse_duration(query['t'][0])
if info.get('timestamp') is not None:
info['subtitles'] = {
'rechat': [{
'url': update_url_query(
'https://api.twitch.tv/v5/videos/%s/comments' % vod_id, {
'client_id': self._CLIENT_ID,
}),
'ext': 'json',
}],
}
if ('live_chat' in self.get_param('subtitleslangs', [])) \
and info.get('timestamp') is not None:
info['subtitles'] = self._download_chat(vod_id)
return info