[twitch] Refactor and add support for past broadcasts

This commit is contained in:
Sergey M․ 2015-01-21 22:27:21 +06:00
parent 47e0e1e0e2
commit c5db6bb32b
2 changed files with 184 additions and 169 deletions

View file

@ -458,7 +458,13 @@
from .tvp import TvpIE, TvpSeriesIE from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE from .tvplay import TVPlayIE
from .twentyfourvideo import TwentyFourVideoIE from .twentyfourvideo import TwentyFourVideoIE
from .twitch import TwitchIE from .twitch import (
TwitchVideoIE,
TwitchChapterIE,
TwitchVodIE,
TwitchProfileIE,
TwitchPastBroadcastsIE,
)
from .ubu import UbuIE from .ubu import UbuIE
from .udemy import ( from .udemy import (
UdemyIE, UdemyIE,

View file

@ -15,44 +15,11 @@
) )
class TwitchIE(InfoExtractor): class TwitchBaseIE(InfoExtractor):
# TODO: One broadcast may be split into multiple videos. The key _VALID_URL_BASE = r'http://(?:www\.)?twitch\.tv'
# 'broadcast_id' is the same for all parts, and 'broadcast_part'
# starts at 1 and increases. Can we treat all parts as one video?
_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
(?:
(?P<channelid>[^/]+)|
(?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
/?(?:\#.*)?$
"""
_PAGE_LIMIT = 100
_API_BASE = 'https://api.twitch.tv' _API_BASE = 'https://api.twitch.tv'
_LOGIN_URL = 'https://secure.twitch.tv/user/login' _LOGIN_URL = 'https://secure.twitch.tv/user/login'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}, {
'url': 'http://www.twitch.tv/acracingleague/c/5285812',
'info_dict': {
'id': 'c5285812',
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
}, {
'url': 'http://www.twitch.tv/vanillatv',
'info_dict': {
'id': 'vanillatv',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
}]
def _handle_error(self, response): def _handle_error(self, response):
if not isinstance(response, dict): if not isinstance(response, dict):
@ -64,71 +31,10 @@ def _handle_error(self, response):
expected=True) expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'): def _download_json(self, url, video_id, note='Downloading JSON metadata'):
response = super(TwitchIE, self)._download_json(url, video_id, note) response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
self._handle_error(response) self._handle_error(response)
return response return response
def _extract_media(self, item, item_id):
ITEMS = {
'a': 'video',
'v': 'vod',
'c': 'chapter',
}
info = self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % ITEMS[item]))
if item == 'v':
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % ITEMS[item])
formats = self._extract_m3u8_formats(
'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
% (item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
info['formats'] = formats
return info
response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s playlist JSON' % ITEMS[item])
entries = []
chunks = response['chunks']
qualities = list(chunks.keys())
for num, fragment in enumerate(zip(*chunks.values()), start=1):
formats = []
for fmt_num, fragment_fmt in enumerate(fragment):
format_id = qualities[fmt_num]
fmt = {
'url': fragment_fmt['url'],
'format_id': format_id,
'quality': 1 if format_id == 'live' else 0,
}
m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
if m:
fmt['height'] = int(m.group('height'))
formats.append(fmt)
self._sort_formats(formats)
entry = dict(info)
entry['id'] = '%s_%d' % (entry['id'], num)
entry['title'] = '%s part %d' % (entry['title'], num)
entry['formats'] = formats
entries.append(entry)
return self.playlist_result(entries, info['id'], info['title'])
def _extract_info(self, info):
return {
'id': info['_id'],
'title': info['title'],
'description': info['description'],
'duration': info['length'],
'thumbnail': info['preview'],
'uploader': info['channel']['display_name'],
'uploader_id': info['channel']['name'],
'timestamp': parse_iso8601(info['recorded_at']),
'view_count': info['views'],
}
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -167,81 +73,184 @@ def _login(self):
raise ExtractorError( raise ExtractorError(
'Unable to login: %s' % m.group('msg').strip(), expected=True) 'Unable to login: %s' % m.group('msg').strip(), expected=True)
class TwitchItemBaseIE(TwitchBaseIE):
def _download_info(self, item, item_id):
return self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % self._ITEM_TYPE))
def _extract_media(self, item_id):
info = self._download_info(self._ITEM_SHORTCUT, item_id)
response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
'Downloading %s playlist JSON' % self._ITEM_TYPE)
entries = []
chunks = response['chunks']
qualities = list(chunks.keys())
for num, fragment in enumerate(zip(*chunks.values()), start=1):
formats = []
for fmt_num, fragment_fmt in enumerate(fragment):
format_id = qualities[fmt_num]
fmt = {
'url': fragment_fmt['url'],
'format_id': format_id,
'quality': 1 if format_id == 'live' else 0,
}
m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
if m:
fmt['height'] = int(m.group('height'))
formats.append(fmt)
self._sort_formats(formats)
entry = dict(info)
entry['id'] = '%s_%d' % (entry['id'], num)
entry['title'] = '%s part %d' % (entry['title'], num)
entry['formats'] = formats
entries.append(entry)
return self.playlist_result(entries, info['id'], info['title'])
def _extract_info(self, info):
return {
'id': info['_id'],
'title': info['title'],
'description': info['description'],
'duration': info['length'],
'thumbnail': info['preview'],
'uploader': info['channel']['display_name'],
'uploader_id': info['channel']['name'],
'timestamp': parse_iso8601(info['recorded_at']),
'view_count': info['views'],
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) return self._extract_media(self._match_id(url))
if mobj.group('chapterid'):
return self._extract_media('c', mobj.group('chapterid'))
"""
webpage = self._download_webpage(url, chapter_id)
m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
if not m:
raise ExtractorError('Cannot find archive of a chapter')
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id class TwitchVideoIE(TwitchItemBaseIE):
doc = self._download_xml( IE_NAME = 'twitch:video'
api, chapter_id, _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
note='Downloading chapter information', _ITEM_TYPE = 'video'
errnote='Chapter information download failed') _ITEM_SHORTCUT = 'a'
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break
else:
raise ExtractorError('Could not find chapter in chapter information')
video_url = a.find('./video_file_url').text _TEST = {
video_ext = video_url.rpartition('.')[2] or 'flv' 'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
'id': 'a577357806',
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
}
chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
chapter_info = self._download_json(
chapter_api_url, 'c' + chapter_id,
note='Downloading chapter metadata',
errnote='Download of chapter metadata failed')
bracket_start = int(doc.find('.//bracket_start').text) class TwitchChapterIE(TwitchItemBaseIE):
bracket_end = int(doc.find('.//bracket_end').text) IE_NAME = 'twitch:chapter'
_VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'chapter'
_ITEM_SHORTCUT = 'c'
# TODO determine start (and probably fix up file) _TEST = {
# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 'url': 'http://www.twitch.tv/acracingleague/c/5285812',
#video_url += '?start=' + TODO:start_timestamp 'info_dict': {
# bracket_start is 13290, but we want 51670615 'id': 'c5285812',
self._downloader.report_warning('Chapter detected, but we can just download the whole file. ' 'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) },
'playlist_mincount': 3,
}
info = {
'id': 'c' + chapter_id, class TwitchVodIE(TwitchItemBaseIE):
'url': video_url, IE_NAME = 'twitch:vod'
'ext': video_ext, _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
'title': chapter_info['title'], _ITEM_TYPE = 'vod'
'thumbnail': chapter_info['preview'], _ITEM_SHORTCUT = 'v'
'description': chapter_info['description'],
'uploader': chapter_info['channel']['display_name'], _TEST = {
'uploader_id': chapter_info['channel']['name'], 'url': 'http://www.twitch.tv/ksptv/v/3622000',
} 'info_dict': {
return info 'id': 'v3622000',
""" 'ext': 'mp4',
elif mobj.group('videoid'): 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
return self._extract_media('a', mobj.group('videoid')) 'thumbnail': 're:^https?://.*\.jpg$',
elif mobj.group('vodid'): 'duration': 6951,
return self._extract_media('v', mobj.group('vodid')) 'timestamp': 1419028564,
elif mobj.group('channelid'): 'upload_date': '20141219',
channel_id = mobj.group('channelid') 'uploader': 'KSPTV',
info = self._download_json( 'uploader_id': 'ksptv',
'%s/kraken/channels/%s' % (self._API_BASE, channel_id), 'view_count': int,
channel_id, 'Downloading channel info JSON') },
channel_name = info.get('display_name') or info.get('name') 'params': {
entries = [] # m3u8 download
offset = 0 'skip_download': True,
limit = self._PAGE_LIMIT },
for counter in itertools.count(1): }
response = self._download_json(
'%s/kraken/channels/%s/videos/?offset=%d&limit=%d' def _real_extract(self, url):
% (self._API_BASE, channel_id, offset, limit), item_id = self._match_id(url)
channel_id, 'Downloading channel videos JSON page %d' % counter) info = self._download_info(self._ITEM_SHORTCUT, item_id)
videos = response['videos'] access_token = self._download_json(
if not videos: '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
break 'Downloading %s access token' % self._ITEM_TYPE)
entries.extend([self.url_result(video['url'], 'Twitch') for video in videos]) formats = self._extract_m3u8_formats(
offset += limit 'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
return self.playlist_result(entries, channel_id, channel_name) % (item_id, access_token['token'], access_token['sig']),
item_id, 'mp4')
info['formats'] = formats
return info
class TwitchPlaylistBaseIE(TwitchBaseIE):
_PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
_PAGE_LIMIT = 100
def _extract_playlist(self, channel_id):
info = self._download_json(
'%s/kraken/channels/%s' % (self._API_BASE, channel_id),
channel_id, 'Downloading channel info JSON')
channel_name = info.get('display_name') or info.get('name')
entries = []
offset = 0
limit = self._PAGE_LIMIT
for counter in itertools.count(1):
response = self._download_json(
self._PLAYLIST_URL % (channel_id, offset, limit),
channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
videos = response['videos']
if not videos:
break
entries.extend([self.url_result(video['url']) for video in videos])
offset += limit
return self.playlist_result(entries, channel_id, channel_name)
def _real_extract(self, url):
return self._extract_playlist(self._match_id(url))
class TwitchProfileIE(TwitchPlaylistBaseIE):
IE_NAME = 'twitch:profile'
_VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
_PLAYLIST_TYPE = 'profile'
_TEST = {
'url': 'http://www.twitch.tv/vanillatv/profile',
'info_dict': {
'id': 'vanillatv',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
}
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
IE_NAME = 'twitch:profile'
_VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
_PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
_PLAYLIST_TYPE = 'past broadcasts'
_TEST = {
'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
'info_dict': {
'id': 'spamfish',
'title': 'Spamfish',
},
'playlist_mincount': 54,
}