From a4a554a79354981fcab55de8eaab7b95a40bbb48 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 16 Feb 2017 23:42:36 +0800 Subject: [PATCH] [generic] Try parsing JWPlayer embedded videos (closes #12030) --- ChangeLog | 6 ++ youtube_dl/extractor/archiveorg.py | 4 +- youtube_dl/extractor/common.py | 118 ++++++++++++++++++++ youtube_dl/extractor/generic.py | 20 ++++ youtube_dl/extractor/jwplatform.py | 132 +---------------------- youtube_dl/extractor/ondemandkorea.py | 4 +- youtube_dl/extractor/pornhub.py | 44 -------- youtube_dl/extractor/pornoxo.py | 4 +- youtube_dl/extractor/rentv.py | 3 +- youtube_dl/extractor/rudo.py | 4 +- youtube_dl/extractor/screencastomatic.py | 4 +- youtube_dl/extractor/sendtonews.py | 4 +- youtube_dl/extractor/thisav.py | 4 +- youtube_dl/extractor/tvnoe.py | 4 +- youtube_dl/extractor/vidzi.py | 4 +- youtube_dl/extractor/wimp.py | 4 +- 16 files changed, 166 insertions(+), 197 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8ef8a83072..4e69b03d0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors ++ [generic] Support complex JWPlayer embedded videos (#12030) + + version 2017.02.16 Core diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 486dff82d0..e21045bed9 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( unified_strdate, clean_html, ) -class ArchiveOrgIE(JWPlatformBaseIE): +class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9681453ca3..f6ff56eda7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -40,6 +40,7 @@ fix_xml_ampersands, float_or_none, int_or_none, + js_to_json, parse_iso8601, RegexNotFoundError, sanitize_filename, @@ -2073,6 +2074,123 @@ def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native }) return formats + @staticmethod + def _find_jwplayer_data(webpage): + mobj = re.search( + r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', + webpage) + if mobj: + return mobj.group('options') + + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._parse_json( + self._find_jwplayer_data(webpage), video_id, + transform_source=js_to_json) + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + # JWPlayer backward compatibility: flattened playlists + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if 'playlist' not in jwplayer_data: + jwplayer_data = {'playlist': [jwplayer_data]} + + entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, this_video_id, mpd_id=mpd_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) + + entries.append({ + 'id': this_video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a2b0298ecb..3db31debea 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,6 +20,7 @@ float_or_none, HEADRequest, is_html, + js_to_json, orderedSet, sanitized_Request, smuggle_url, @@ -961,6 +962,16 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', @@ -2488,6 +2499,15 @@ def _playlist_from_matches(matches, getter=None, ie=None): self._sort_formats(entry['formats']) return self.playlist_result(entries) + jwplayer_data_str = self._find_jwplayer_data(webpage) + if jwplayer_data_str: + try: + jwplayer_data = self._parse_json( + jwplayer_data_str, video_id, transform_source=js_to_json) + return self._parse_jwplayer_data(jwplayer_data, video_id) + except ExtractorError: + pass + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index aff7ab49a9..33d55f7706 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,139 +4,9 @@ import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - js_to_json, - mimetype2ext, - urljoin, -) -class JWPlatformBaseIE(InfoExtractor): - @staticmethod - def _find_jwplayer_data(webpage): - # TODO: Merge this with JWPlayer-related codes in generic.py - - mobj = re.search( - r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', - webpage) - if mobj: - return mobj.group('options') - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id, - transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('kind') != 'captions': - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entries.append({ - 'id': this_video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - }) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - -class JWPlatformIE(JWPlatformBaseIE): +class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TEST = { 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py index de1d6b08a4..dcd1577775 100644 --- a/youtube_dl/extractor/ondemandkorea.py +++ b/youtube_dl/extractor/ondemandkorea.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( ExtractorError, js_to_json, ) -class OnDemandKoreaIE(JWPlatformBaseIE): +class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7a2737032f..9b413590a4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -169,50 +169,6 @@ def dl_webpage(platform): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - """ - video_variables = {} - for video_variablename, quote, video_variable in re.findall( - r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage): - video_variables[video_variablename] = video_variable - - video_urls = [] - for encoded_video_url in re.findall( - r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage): - for varname, varval in video_variables.items(): - encoded_video_url = encoded_video_url.replace(varname, varval) - video_urls.append(re.sub(r'[\s+]', '', encoded_video_url)) - - if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse_unquote_plus( - self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) - - formats = [] - for video_url in video_urls: - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - m = re.match(r'^(?P[0-9]+)[pP]-(?P[0-9]+)[kK]$', format) - if m is None: - height = None - tbr = None - else: - height = int(m.group('height')) - tbr = int(m.group('tbr')) - - formats.append({ - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'tbr': tbr, - 'height': height, - }) - self._sort_formats(formats) - """ - page_params = self._parse_json(self._search_regex( r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', webpage, 'page parameters', group='data', default='{}'), diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py index 1a0cce7e02..2831368b6a 100644 --- a/youtube_dl/extractor/pornoxo.py +++ b/youtube_dl/extractor/pornoxo.py @@ -2,13 +2,13 @@ import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( str_to_int, ) -class PornoXOIE(JWPlatformBaseIE): +class PornoXOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P\d+)/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 422c02cff3..d338b3a933 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .jwplatform import JWPlatformBaseIE from ..compat import compat_str -class RENTVIE(JWPlatformBaseIE): +class RENTVIE(InfoExtractor): _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P\d+)' _TESTS = [{ 'url': 'http://ren.tv/video/epizod/118577', diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py index 3bfe934d82..51644011e5 100644 --- a/youtube_dl/extractor/rudo.py +++ b/youtube_dl/extractor/rudo.py @@ -3,7 +3,7 @@ import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( js_to_json, get_element_by_class, @@ -11,7 +11,7 @@ ) -class RudoIE(JWPlatformBaseIE): +class RudoIE(InfoExtractor): _VALID_URL = r'https?://rudo\.video/vod/(?P[0-9a-zA-Z]+)' _TEST = { diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 94a2a37d20..b5e76c9af0 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,11 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import js_to_json -class ScreencastOMaticIE(JWPlatformBaseIE): +class ScreencastOMaticIE(InfoExtractor): _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' _TEST = { 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 9880a5a78c..9d9652949b 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -3,7 +3,7 @@ import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( float_or_none, parse_iso8601, @@ -14,7 +14,7 @@ ) -class SendtoNewsIE(JWPlatformBaseIE): +class SendtoNewsIE(InfoExtractor): _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' _TEST = { diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py index 4473a3c773..b7b3568cb3 100644 --- a/youtube_dl/extractor/thisav.py +++ b/youtube_dl/extractor/thisav.py @@ -3,11 +3,11 @@ import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import remove_end -class ThisAVIE(JWPlatformBaseIE): +class ThisAVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P[0-9]+)/.*' _TESTS = [{ 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 6d5c748264..1a5b76bf2e 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( clean_html, get_element_by_class, @@ -9,7 +9,7 @@ ) -class TVNoeIE(JWPlatformBaseIE): +class TVNoeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P[0-9]+)' _TEST = { 'url': 'http://www.tvnoe.cz/video/10362', diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9950c62ad6..1f1828fce4 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -3,7 +3,7 @@ import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( decode_packed_codes, js_to_json, @@ -12,7 +12,7 @@ ) -class VidziIE(JWPlatformBaseIE): +class VidziIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 54eb514279..c022fb33e9 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals +from .common import InfoExtractor from .youtube import YoutubeIE -from .jwplatform import JWPlatformBaseIE -class WimpIE(JWPlatformBaseIE): +class WimpIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maru-is-exhausted/',