From 706dfe441b3cf01c0e2b294afc7d293211a74e94 Mon Sep 17 00:00:00 2001
From: Jeff Huffman
Date: Fri, 28 Jan 2022 16:33:51 -0800
Subject: [PATCH] [crunchyroll:beta] Add cookies support (#2506)

* Extract directly from the beta API when cookies are passed. If login cookie is absent, the extraction is delegated to `CrunchyrollIE`. This causes different metadata to be extracted (including formats and video id) and therefore results in a different archive entry. For now, this issue is unavoidable since the browser also redirects to the old site when not logged in.

* Adds extractor-args `format` and `hardsub` to control the source and subtitles of the extracted formats

Closes #1911
Authored by: tejing1
---
 README.md                       |   5 ++
 yt_dlp/extractor/crunchyroll.py | 132 ++++++++++++++++++++++++++++++--
 2 files changed, 130 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index db31c55ee0..ae09262c37 100644
--- a/README.md
+++ b/README.md
@@ -1670,6 +1670,11 @@ #### crunchyroll
 * `language`: Languages to extract. Eg: `crunchyroll:language=jaJp`
 * `hardsub`: Which hard-sub versions to extract. Eg: `crunchyroll:hardsub=None,enUS`
 
+#### crunchyroll:beta
+* `format`: Which stream type(s) to extract. Default is `adaptive_hls`. Eg: `crunchyrollbeta:format=vo_adaptive_hls`
+    * Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `trailer_hls`, `trailer_dash`
+* `hardsub`: Preference order for which hardsub versions to extract. Default is `None` (no hardsubs). Eg: `crunchyrollbeta:hardsub=en-US,None`
+
 #### vikichannel
 * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers`
 
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index cd35728e58..5253e7e4ba 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import base64
 import re
 import json
 import zlib
@@ -23,13 +24,16 @@
     bytes_to_intlist,
     extract_attributes,
     float_or_none,
+    format_field,
     intlist_to_bytes,
     int_or_none,
+    join_nonempty,
     lowercase_escape,
     merge_dicts,
     qualities,
     remove_end,
     sanitized_Request,
+    traverse_obj,
     try_get,
     urlencode_postdata,
     xpath_text,
@@ -733,13 +737,127 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE):
     def _real_extract(self, url):
         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id')
         webpage = self._download_webpage(url, display_id)
-        episode_data = self._parse_json(
-            self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'),
-            display_id)['content']['byId'][internal_id]
-        video_id = episode_data['external_id'].split('.')[1]
-        series_id = episode_data['episode_metadata']['series_slug_title']
-        return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
-                               CrunchyrollIE.ie_key(), video_id)
+        initial_state = self._parse_json(
+            self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'),
+            display_id)
+        episode_data = initial_state['content']['byId'][internal_id]
+        # Without the login cookie the browser is redirected to the old site as well,
+        # so delegate the episode to CrunchyrollIE
+        if not self._get_cookies(url).get('etp_rt'):
+            video_id = episode_data['external_id'].split('.')[1]
+            series_id = episode_data['episode_metadata']['series_slug_title']
+            return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}',
+                                   CrunchyrollIE.ie_key(), video_id)
+
+        app_config = self._parse_json(
+            self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'),
+            display_id)
+        client_id = app_config['cxApiParams']['accountAuthClientId']
+        api_domain = app_config['cxApiParams']['apiDomain']
+        basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii')
+        auth_response = self._download_json(
+            f'{api_domain}/auth/v1/token', display_id,
+            note='Authenticating with cookie',
+            headers={
+                'Authorization': 'Basic ' + basic_token
+            }, data='grant_type=etp_rt_cookie'.encode('ascii'))
+        policy_response = self._download_json(
+            f'{api_domain}/index/v2', display_id,
+            note='Retrieving signed policy',
+            headers={
+                'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
+            })
+        bucket = policy_response['cms']['bucket']
+        params = {
+            'Policy': policy_response['cms']['policy'],
+            'Signature': policy_response['cms']['signature'],
+            'Key-Pair-Id': policy_response['cms']['key_pair_id']
+        }
+        locale = traverse_obj(initial_state, ('localization', 'locale'))
+        if locale:
+            params['locale'] = locale
+        episode_response = self._download_json(
+            f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
+            note='Retrieving episode metadata',
+            query=params)
+        if episode_response.get('is_premium_only') and not episode_response.get('playback'):
+            raise ExtractorError('This video is for premium members only.', expected=True)
+        stream_response = self._download_json(
+            episode_response['playback'], display_id,
+            note='Retrieving stream info')
+
+        thumbnails = []
+        # traverse_obj gives None when the episode has no thumbnail images
+        for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')) or []:
+            for thumbnail_data in thumbnails_data:
+                thumbnails.append({
+                    'url': thumbnail_data.get('source'),
+                    'width': thumbnail_data.get('width'),
+                    'height': thumbnail_data.get('height'),
+                })
+        subtitles = {}
+        # Named sub_lang so that the `lang` matched from the URL is not overwritten
+        for sub_lang, subtitle_data in (stream_response.get('subtitles') or {}).items():
+            subtitles[sub_lang] = [{
+                'url': subtitle_data.get('url'),
+                'ext': subtitle_data.get('format')
+            }]
+
+        requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
+        hardsub_preference = qualities(requested_hardsubs[::-1])
+        requested_formats = self._configuration_arg('format') or ['adaptive_hls']
+
+        formats = []
+        for stream_type, streams in (stream_response.get('streams') or {}).items():
+            if stream_type not in requested_formats:
+                continue
+            for stream in streams.values():
+                hardsub_lang = stream.get('hardsub_locale') or ''
+                if hardsub_lang.lower() not in requested_hardsubs:
+                    continue
+                format_id = join_nonempty(
+                    stream_type,
+                    format_field(stream, 'hardsub_locale', 'hardsub-%s'))
+                if not stream.get('url'):
+                    continue
+                if stream_type.split('_')[-1] == 'hls':
+                    adaptive_formats = self._extract_m3u8_formats(
+                        stream['url'], display_id, 'mp4', m3u8_id=format_id,
+                        note='Downloading %s information' % format_id,
+                        fatal=False)
+                elif stream_type.split('_')[-1] == 'dash':
+                    adaptive_formats = self._extract_mpd_formats(
+                        stream['url'], display_id, mpd_id=format_id,
+                        note='Downloading %s information' % format_id,
+                        fatal=False)
+                else:
+                    # Neither HLS nor DASH: falling through would use an unbound
+                    # (or stale, from the previous stream) adaptive_formats
+                    self.report_warning('Skipping unsupported stream type %r' % stream_type, display_id)
+                    continue
+                for f in adaptive_formats:
+                    if f.get('acodec') != 'none':
+                        f['language'] = stream_response.get('audio_locale')
+                    f['quality'] = hardsub_preference(hardsub_lang.lower())
+                formats.extend(adaptive_formats)
+        self._sort_formats(formats)
+
+        return {
+            'id': internal_id,
+            'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
+            'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')),
+            'duration': float_or_none(episode_response.get('duration_ms'), 1000),
+            'thumbnails': thumbnails,
+            'series': episode_response.get('series_title'),
+            'series_id': episode_response.get('series_id'),
+            'season': episode_response.get('season_title'),
+            'season_id': episode_response.get('season_id'),
+            'season_number': episode_response.get('season_number'),
+            'episode': episode_response.get('title'),
+            'episode_number': episode_response.get('sequence_number'),
+            'subtitles': subtitles,
+            'formats': formats
+        }
 
 
 class CrunchyrollBetaShowIE(CrunchyrollBaseIE):