Merge pull request #12 from siikamiika/youtube-live-chat

YouTube live chat
This commit is contained in:
Tom-Oliver Heidel 2020-08-31 23:47:06 +02:00 committed by GitHub
commit a9c069012f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 138 additions and 17 deletions

View file

@ -1805,6 +1805,14 @@ def ensure_dir_exists(path):
self.report_error('Cannot write annotations file: ' + annofn) self.report_error('Cannot write annotations file: ' + annofn)
return return
def dl(name, info):
    """Download ``info`` into ``name`` using the downloader chosen for it.

    The downloader class is selected by ``get_suitable_downloader`` from
    ``info`` and the current params; every registered progress hook is
    attached before the download starts. Returns whatever the
    downloader's ``download`` call returns.
    """
    downloader_cls = get_suitable_downloader(info, self.params)
    downloader = downloader_cls(self, self.params)
    for hook in self._progress_hooks:
        downloader.add_progress_hook(hook)
    if self.params.get('verbose'):
        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
    return downloader.download(name, info)
subtitles_are_requested = any([self.params.get('writesubtitles', False), subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')]) self.params.get('writeautomaticsub')])
@ -1812,14 +1820,12 @@ def ensure_dir_exists(path):
# subtitles download errors are already managed as troubles in relevant IE # subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE # that way it will silently go on when used with unsupporting IE
subtitles = info_dict['requested_subtitles'] subtitles = info_dict['requested_subtitles']
ie = self.get_info_extractor(info_dict['extractor_key'])
for sub_lang, sub_info in subtitles.items(): for sub_lang, sub_info in subtitles.items():
sub_format = sub_info['ext'] sub_format = sub_info['ext']
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
else: else:
self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
if sub_info.get('data') is not None: if sub_info.get('data') is not None:
try: try:
# Use newline='' to prevent conversion of newline characters # Use newline='' to prevent conversion of newline characters
@ -1831,11 +1837,11 @@ def ensure_dir_exists(path):
return return
else: else:
try: try:
sub_data = ie._request_webpage( dl(sub_filename, sub_info)
sub_info['url'], info_dict['id'], note=False).read() except (ExtractorError, IOError, OSError, ValueError,
with io.open(encodeFilename(sub_filename), 'wb') as subfile: compat_urllib_error.URLError,
subfile.write(sub_data) compat_http_client.HTTPException,
except (ExtractorError, IOError, OSError, ValueError) as err: socket.error) as err:
self.report_warning('Unable to download subtitle for "%s": %s' % self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, error_to_compat_str(err))) (sub_lang, error_to_compat_str(err)))
continue continue
@ -1856,14 +1862,6 @@ def ensure_dir_exists(path):
if not self.params.get('skip_download', False): if not self.params.get('skip_download', False):
try: try:
def dl(name, info):
    """Download ``info`` into ``name`` using the downloader chosen for it.

    The downloader class is selected by ``get_suitable_downloader`` from
    ``info`` and the current params; every registered progress hook is
    attached before the download starts. Returns whatever the
    downloader's ``download`` call returns.
    """
    downloader_cls = get_suitable_downloader(info, self.params)
    downloader = downloader_cls(self, self.params)
    for hook in self._progress_hooks:
        downloader.add_progress_hook(hook)
    if self.params.get('verbose'):
        self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
    return downloader.download(name, info)
if info_dict.get('requested_formats') is not None: if info_dict.get('requested_formats') is not None:
downloaded = [] downloaded = []
success = True success = True

View file

@ -8,6 +8,7 @@
from .dash import DashSegmentsFD from .dash import DashSegmentsFD
from .rtsp import RtspFD from .rtsp import RtspFD
from .ism import IsmFD from .ism import IsmFD
from .youtube_live_chat import YoutubeLiveChatReplayFD
from .external import ( from .external import (
get_external_downloader, get_external_downloader,
FFmpegFD, FFmpegFD,
@ -26,6 +27,7 @@
'f4m': F4mFD, 'f4m': F4mFD,
'http_dash_segments': DashSegmentsFD, 'http_dash_segments': DashSegmentsFD,
'ism': IsmFD, 'ism': IsmFD,
'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
} }

View file

@ -0,0 +1,94 @@
from __future__ import division, unicode_literals
import re
import json
from .fragment import FragmentFD
class YoutubeLiveChatReplayFD(FragmentFD):
    """Download a YouTube live chat replay fragment by fragment.

    The watch page is fetched first to obtain the initial chat
    continuation token; chat pages are then requested one after another.
    Every chat action of a page is appended to the output as one UTF-8
    encoded JSON document per line.
    """

    FD_NAME = 'youtube_live_chat_replay'

    # Milliseconds subtracted from the offset of the last seen chat item
    # when building ``playerOffsetMs`` for the next page request.
    _REPLAY_OVERLAP_MS = 5000

    # Byte patterns locating the ``ytInitialData`` assignment in a page.
    _YT_INITIAL_DATA_PATTERNS = (
        b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});',
        b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});',
    )

    @classmethod
    def _parse_yt_initial_data(cls, page):
        """Return the parsed ``ytInitialData`` object from ``page``, or None.

        ``page`` is the raw page as bytes. None is returned when no pattern
        matches or when no matched text is valid JSON.
        """
        for pattern in cls._YT_INITIAL_DATA_PATTERNS:
            match = re.search(pattern, page)
            if match is None:
                continue
            try:
                # Decode explicitly: json.loads() only accepts bytes on
                # Python 3.6+.
                return json.loads(match.group(1).decode('utf-8'))
            except ValueError:
                # The non-greedy match can stop at an early "};" and yield
                # truncated JSON; give the next pattern a chance.
                continue
        return None

    @staticmethod
    def _lookup(obj, *path):
        """Follow ``path`` (dict keys / list indexes) through ``obj``.

        Returns None as soon as a step is missing or ``obj`` is not
        subscriptable, instead of raising.
        """
        for step in path:
            try:
                obj = obj[step]
            except (KeyError, IndexError, TypeError):
                return None
        return obj

    def real_download(self, filename, info_dict):
        """Download the chat replay described by ``info_dict`` to ``filename``.

        ``info_dict`` must contain ``video_id``; ``http_headers`` is used
        when present. Returns True when the replay was written and False
        when a fragment could not be downloaded or the expected chat data
        could not be found in a response.
        """
        video_id = info_dict['video_id']
        self.to_screen('[%s] Downloading live chat' % self.FD_NAME)

        test = self.params.get('test', False)

        ctx = {
            'filename': filename,
            'live': True,
            'total_frags': None,
        }

        headers = info_dict.get('http_headers', {})

        def dl_fragment(url):
            return self._download_fragment(ctx, url, info_dict, headers)

        def give_up(reason):
            # Missing chat data is reported and signalled through the
            # return value rather than crashing with a lookup error.
            self.report_warning('[%s] %s' % (self.FD_NAME, reason))
            return False

        self._prepare_and_start_frag_download(ctx)

        success, raw_fragment = dl_fragment(
            'https://www.youtube.com/watch?v={}'.format(video_id))
        if not success:
            return False
        continuation_id = self._lookup(
            self._parse_yt_initial_data(raw_fragment),
            'contents', 'twoColumnWatchNextResults', 'conversationBar',
            'liveChatRenderer', 'continuations', 0,
            'reloadContinuationData', 'continuation')
        if continuation_id is None:
            return give_up(
                'Unable to find a live chat replay continuation on the watch page')

        # no data yet but required to call _append_fragment
        self._append_fragment(ctx, b'')

        first = True
        offset = None
        while continuation_id is not None:
            if first:
                url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id)
            else:
                # ``offset`` is never None here: the loop stops as soon as
                # a page yields no offset. Clamp so that an early first
                # chat item cannot produce a negative player offset.
                url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay'
                       + '?continuation={}'.format(continuation_id)
                       + '&playerOffsetMs={}'.format(
                           max(offset - self._REPLAY_OVERLAP_MS, 0))
                       + '&hidden=false'
                       + '&pbj=1')
            success, raw_fragment = dl_fragment(url)
            if not success:
                return False

            if first:
                data = self._parse_yt_initial_data(raw_fragment)
            else:
                try:
                    response = json.loads(raw_fragment.decode('utf-8'))
                except ValueError:
                    response = None
                data = self._lookup(response, 'response')
            first = False

            live_chat_continuation = self._lookup(
                data, 'continuationContents', 'liveChatContinuation')
            if not isinstance(live_chat_continuation, dict):
                return give_up('Unable to find live chat data in the response')

            offset = None
            processed_fragment = bytearray()
            for action in live_chat_continuation.get('actions') or ():
                if 'replayChatItemAction' in action:
                    replay_chat_item_action = action['replayChatItemAction']
                    offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
                processed_fragment.extend(
                    json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')

            self._append_fragment(ctx, processed_fragment)

            if test or offset is None:
                break

            # A page without a replay continuation ends the loop cleanly
            # instead of raising KeyError.
            continuation_id = self._lookup(
                live_chat_continuation,
                'continuations', 0, 'liveChatReplayContinuationData',
                'continuation')

        self._finish_frag_download(ctx)
        return True

View file

@ -1435,7 +1435,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
raise ExtractorError( raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e) 'Signature extraction failed: ' + tb, cause=e)
def _get_subtitles(self, video_id, webpage): def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
try: try:
subs_doc = self._download_xml( subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@ -1462,6 +1462,14 @@ def _get_subtitles(self, video_id, webpage):
'ext': ext, 'ext': ext,
}) })
sub_lang_list[lang] = sub_formats sub_lang_list[lang] = sub_formats
if has_live_chat_replay:
sub_lang_list['live_chat'] = [
{
'video_id': video_id,
'ext': 'json',
'protocol': 'youtube_live_chat_replay',
},
]
if not sub_lang_list: if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles') self._downloader.report_warning('video doesn\'t have subtitles')
return {} return {}
@ -1485,6 +1493,15 @@ def _get_ytplayer_config(self, video_id, webpage):
return self._parse_json( return self._parse_json(
uppercase_escape(config), video_id, fatal=False) uppercase_escape(config), video_id, fatal=False)
def _get_yt_initial_data(self, video_id, webpage):
    """Locate the ``ytInitialData`` assignment in ``webpage`` and parse it.

    Both the ``window["ytInitialData"] = ...`` and the
    ``var ytInitialData = ...`` forms are searched for. Returns None when
    the search yields nothing; otherwise returns the result of the
    non-fatal JSON parse of the matched text.
    """
    patterns = (
        r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
        r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
    )
    raw_initial_data = self._search_regex(
        patterns, webpage, 'ytInitialData', default=None)
    if not raw_initial_data:
        return None
    return self._parse_json(
        uppercase_escape(raw_initial_data), video_id, fatal=False)
def _get_automatic_captions(self, video_id, webpage): def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
@ -1978,6 +1995,15 @@ def feed_entry(name):
if is_live is None: if is_live is None:
is_live = bool_or_none(video_details.get('isLive')) is_live = bool_or_none(video_details.get('isLive'))
has_live_chat_replay = False
if not is_live:
yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
try:
yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
has_live_chat_replay = True
except (KeyError, IndexError, TypeError):
pass
# Check for "rental" videos # Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@ -2385,7 +2411,8 @@ def _extract_count(count_name):
or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage) video_subtitles = self.extract_subtitles(
video_id, video_webpage, has_live_chat_replay)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
video_duration = try_get( video_duration = try_get(